diff options
author | Rasmus Dahlberg <rgdd@glasklarteknik.se> | 2025-01-18 13:39:40 +0100 |
---|---|---|
committer | Rasmus Dahlberg <rgdd@glasklarteknik.se> | 2025-01-18 16:34:14 +0100 |
commit | 8a17817c61f14a727a1017a5bcd4b1ea82964528 (patch) | |
tree | ec0fa96bfc683e906413106f2db2b99d710dc389 | |
parent | 2d3b1f2cb0c05385c1702f1a7d74fa08d52c262f (diff) |
prometheus: Refine based on input from anarcatmain
https://gitlab.torproject.org/tpo/tpa/team/-/issues/40677
-rwxr-xr-x | contrib/silentct-check | 104 | ||||
-rw-r--r-- | docs/metrics.md | 83 | ||||
-rwxr-xr-x | integration/smoke-test | 4 | ||||
-rw-r--r-- | internal/manager/manager.go | 5 | ||||
-rw-r--r-- | internal/metrics/metrics.go | 80 | ||||
-rw-r--r-- | pkg/storage/index/index.go | 4 | ||||
-rw-r--r-- | pkg/storage/index/inmem.go | 16 | ||||
-rwxr-xr-x | scripts/silentct-check | 96 |
8 files changed, 220 insertions, 172 deletions
diff --git a/contrib/silentct-check b/contrib/silentct-check new file mode 100755 index 0000000..97aa2b2 --- /dev/null +++ b/contrib/silentct-check @@ -0,0 +1,104 @@ +#!/bin/bash + +# +# A script that outputs warnings from silentct-mon's Prometheus metrics. Mainly +# meant as an example, but may be useful for simple (periodic) one-shot checks. +# + +set -eu + +#------------------------------------------------------------------------------- +# Configuration +#------------------------------------------------------------------------------- +METRICS_AT=${METRICS_AT:-http://localhost:8080/metrics} +ALERT_BACKLOG=${ALERT_BACKLOG:-65536} +ALERT_FRESHNESS=${ALERT_FRESHNESS:-86400} + +#------------------------------------------------------------------------------- +# Helper functions +#------------------------------------------------------------------------------- +notice() { + echo "NOTICE: $*" >&2 +} + +die() { + echo "FATAL: $*" >&2 + exit 1 +} + +to_integer() { + printf "%.f" "$1" +} + +extract_label() { + local line=$1 + local label=$2 + echo "${line#*"$label"=}" | cut -d'"' -f2 +} + +extract_value() { + local line=$1 + echo "${line##* }" +} + +#------------------------------------------------------------------------------- +# Fetch metrics +#------------------------------------------------------------------------------- +metrics_file=$(mktemp) +trap 'rm -f $metrics_file' EXIT + +curl -so "$metrics_file" "$METRICS_AT" || die "failed fetching from $METRICS_AT" + +#------------------------------------------------------------------------------- +# Parse metrics +#------------------------------------------------------------------------------- +declare -A log_index # log ID -> log index +declare -A log_size # log ID -> log size +declare -A log_timestamp # log ID -> log timestamp + +need_restart=0 +num_unexpected_crt=0 + +while IFS= read -r line; do + [[ $line =~ ^# ]] && continue + + case "$line" in + silentct_log_index*) + key=$(extract_label "$line" "log_id") + value=$(to_integer "$(extract_value "$line")") + log_index["$key"]=$value + ;; + silentct_log_size*) + key=$(extract_label "$line" "log_id") + value=$(to_integer "$(extract_value "$line")") + log_size["$key"]=$value + ;; + silentct_log_timestamp*) + key=$(extract_label "$line" "log_id") + value=$(to_integer "$(extract_value "$line")") + log_timestamp["$key"]=$((value / 1000)) + ;; + silentct_need_restart*) + need_restart=$(extract_value "$line") + ;; + silentct_unexpected_certificate*) + num_unexpected_crt=$((num_unexpected_crt + 1)) + ;; + esac +done <"$metrics_file" + +#------------------------------------------------------------------------------- +# Output warnings +#------------------------------------------------------------------------------- +now=$(date +%s) + +for log_id in "${!log_size[@]}"; do + backlog=$((log_size[$log_id] - log_index[$log_id])) + elapsed=$((now - log_timestamp[$log_id])) + + ((backlog < ALERT_BACKLOG)) || notice "log $log_id -- backlog is at $backlog" + ((elapsed < ALERT_FRESHNESS)) || notice "log $log_id -- latest timestamp at $(date -d @"${log_timestamp[$log_id]}")" +done + +[[ $need_restart == 0 ]] || notice "silentct-mon needs to be restarted" +[[ $num_unexpected_crt == 0 ]] || notice "$num_unexpected_crt unexpected certificate(s)" diff --git a/docs/metrics.md b/docs/metrics.md index 1dea0ef..aac873e 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -1,59 +1,56 @@ # Metrics -The `silentct-mon` program emits Prometheus metrics -- enable using the `-m` -option. For a *bash example* of how to create appropriate alerts from these -Prometheus metrics, see [scripts/silentct-check](../scripts/silentct-check). +`silentct-mon` can output Prometheus metrics -- enable using the `-m` option. -## `"silentct_log_size"` +## Examples of useful alerts + + - **The monitor is falling behind on downloading a particular log**, e.g., + `silentct_log_size - silentct_log_index > 65536`. + - **The monitor hasn't seen a fresh timestamp from a particular log**, e.g., + `time() - silentct_log_timestamp > 24*60*60`. + - **The monitor needs restarting**, e.g., `silentct_need_restart != 0` + - **Unexpected certificates have been found**, e.g., + `silentct_unexpected_certificate_count > 0`. + +## `"silentct_error_counter"` ``` -# HELP silentct_log_size The number of entries in the log. -# TYPE silentct_log_size gauge -silentct_log_size{id="TnWjJ1yaEMM4W2zU3z9S6x3w4I4bjWnAsfpksWKaOd8="} 6.07308178e+08 +# HELP silentct_error_counter The number of errors propagated to the main loop. +# TYPE silentct_error_counter counter +silentct_error_counter 0 ``` -`id` is a unique log identifier in base64 (computed as in RFC 6962, §3.2). +Do not use for alerting, this metric is too noisy and currently used for debug. ## `"silentct_log_index"` ``` # HELP silentct_log_index The next log entry to be downloaded. # TYPE silentct_log_index gauge -silentct_log_index{id="TnWjJ1yaEMM4W2zU3z9S6x3w4I4bjWnAsfpksWKaOd8="} 6.07307424e+08 +silentct_log_index{log_id="4e75a3275c9a10c3385b6cd4df3f52eb1df0e08e1b8d69c0b1fa64b1629a39df"} 7.30980064e+08 ``` -`id` is a unique log identifier in base64 (computed as in RFC 6962, §3.2). +`log_id` is a unique log identifier in hex, computed as in RFC 6962 §3.2. -## `"silentct_log_timestamp"` - -``` -# HELP silentct_log_timestamp The log's UNIX timestamp in ms. -# TYPE silentct_log_timestamp gauge -silentct_log_timestamp{id="TnWjJ1yaEMM4W2zU3z9S6x3w4I4bjWnAsfpksWKaOd8="} 1.735992491111e+12 -``` - -`id` is a unique log identifier in base64 (computed as in RFC 6962, §3.2). - -## `"silentct_certificate_alert"` +## `"silentct_log_size"` ``` -# HELP silentct_certificate_alert The time the certificate without allowlisting was found. -# TYPE silentct_certificate_alert gauge -silentct_certificate_alert{stored_at="/path/to/state/crt_found/<log-hex-id>-<log-index>.json"} 1.735992551e+09 +# HELP silentct_log_size The number of entries in the log. +# TYPE silentct_log_size gauge +silentct_log_size{log_id="4e75a3275c9a10c3385b6cd4df3f52eb1df0e08e1b8d69c0b1fa64b1629a39df"} 7.31044085e+08 ``` -`stored_at` is where the log entry is stored on the monitor's local file system. -For convenience, the parsed log-entry certificate is also available as `.ascii`. +`log_id` is a unique log identifier in hex, computed as in RFC 6962 §3.2. -## `"silentct_error_counter"` +## `"silentct_log_timestamp"` ``` -# HELP silentct_error_counter The number of errors propagated to the main loop. -# TYPE silentct_error_counter counter -silentct_error_counter 0 +# HELP silentct_log_timestamp The log's UNIX timestamp in ms. +# TYPE silentct_log_timestamp gauge +silentct_log_timestamp{log_id="4e75a3275c9a10c3385b6cd4df3f52eb1df0e08e1b8d69c0b1fa64b1629a39df"} 1.737202578179e+12 ``` -Do not use for alerting, this metric is too noisy and currently used for debug. +`log_id` is a unique log identifier in hex, computed as in RFC 6962 §3.2. ## `"silentct_need_restart"` @@ -65,3 +62,27 @@ silentct_need_restart 0 Restarts are normally not needed; but here's a metric until the `silentct-mon` implementation can assure that all corner-cases are handled without restarts. + +## `"silentct_unexpected_certificate_count"` + +``` +# HELP silentct_unexpected_certificate_count Number of certificates without any allowlisting +# TYPE silentct_unexpected_certificate_count gauge +silentct_unexpected_certificate_count{crt_sans="example.org www.example.org",log_id="4e75a3275c9a10c3385b6cd4df3f52eb1df0e08e1b8d69c0b1fa64b1629a39df",log_index="1234"} 1 +``` + +`crt_sans` are the subject alternative names in the unexpected certificate, +space separated. + +`log_id` is a unique log identifier in hex, computed as in RFC 6962 §3.2. + +`log_index` specifies the log entry that contains the unexpected certificate. + +See `STATE_DIRECTORY/crt_found/<log_id>-<log_index>.*` for further details. The +`.json` file contains the downloaded log entry. The `.ascii` file contains the +parsed leaf certificate in a human-readable format to make debugging easier. + +Allowlist an unexpected certificate by ingesting it from a trusted certificate +requester. Alternatively: stop the monitor, manually move the unexpected +certificate from the "alerting" dictionary to the "legitimate" dictionary in +`STATE_DIRECTORY/crt_index.json`, save, and then start the monitor again. diff --git a/integration/smoke-test b/integration/smoke-test index a128592..dacecd0 100755 --- a/integration/smoke-test +++ b/integration/smoke-test @@ -86,7 +86,7 @@ next_domain=$(go run github.com/google/certificate-transparency-go/client/ctclie config "$next_domain" "$remove_keys" >"$dir/config.json" timeout 10s go run ../cmd/silentct-mon -c "$dir/config.json" -d "$dir/state" -C "dev:silentct" -p 1s -v DEBUG 2>&1 | tee "$dir/output.txt" grep -q -E '([1-9][0-9]*|[1-9] matches)' "$dir/output.txt" || die "expected at least one match" -grep -q -F '[NOTICE] certificate mis-issuance?' "$dir/output.txt" || die "expected notice about mis-issued certificate" +grep -q -F '[NOTICE] unexpected certificate:' "$dir/output.txt" || die "expected notice about unexpected certificate" pass "run the monitor and be warned of an unreported certificate" @@ -97,4 +97,4 @@ echo "---" >&2 echo "All smoke tests passed" >&2 echo "For interactive tests:" >&2 echo "go run ../cmd/silentct-mon -c "$dir/config.json" -d "$dir/state" -C "dev:silentct" -p 15s -m localhost:8080 -v DEBUG" >&2 -echo "ALERT_BACKLOG=0 ALERT_FRESHNESS=0 ../scripts/silentct-check" >&2 +echo "ALERT_BACKLOG=0 ALERT_FRESHNESS=0 ../contrib/silentct-check" >&2 diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 90f6507..d63d52d 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "os" + "strings" "time" "gitlab.torproject.org/rgdd/ct/pkg/metadata" @@ -222,9 +223,9 @@ func (mgr *Manager) alertJob() error { return err } for _, alert := range alerts { - mgr.Logger.Noticef("certificate mis-issuance? No allowlisting for %s\n", alert.StoredAt) + mgr.Logger.Noticef("unexpected certificate: no allowlisting for crt_sans=\"%s\", see log_id=\"%x\" log_index=\"%d\"\n", strings.Join(alert.SANs, " "), alert.LogID, alert.LogIndex) } - mgr.Metrics.CertificateAlert(mgr.Storage.Index.Alerting()) + mgr.Metrics.UnexpectedCertificateCount(mgr.Storage.Index.Alerting()) return nil } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index c5ff0d6..7a67a9c 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -1,55 +1,51 @@ package metrics import ( + "fmt" + "strings" + "github.com/prometheus/client_golang/prometheus" "rgdd.se/silentct/internal/monitor" "rgdd.se/silentct/pkg/storage/index" ) type Metrics struct { - logSize *prometheus.GaugeVec - logIndex *prometheus.GaugeVec - logTimestamp *prometheus.GaugeVec - certificateAlert *prometheus.GaugeVec - errorCounter prometheus.Counter - needRestart prometheus.Gauge + errorCounter prometheus.Counter + logIndex *prometheus.GaugeVec + logSize *prometheus.GaugeVec + logTimestamp *prometheus.GaugeVec + needRestart prometheus.Gauge + unexpectedCertificateCount *prometheus.GaugeVec } func NewMetrics(registry *prometheus.Registry) *Metrics { m := &Metrics{ - logSize: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "silentct_log_size", - Help: "The number of entries in the log.", + errorCounter: prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "silentct_error_counter", + Help: "The number of errors propagated to the main loop.", }, - []string{"id"}, ), logIndex: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "silentct_log_index", Help: "The next log entry to be downloaded.", }, - []string{"id"}, + []string{"log_id"}, ), - logTimestamp: prometheus.NewGaugeVec( + logSize: prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "silentct_log_timestamp", - Help: "The log's UNIX timestamp in ms.", + Name: "silentct_log_size", + Help: "The number of entries in the log.", }, - []string{"id"}, + []string{"log_id"}, ), - certificateAlert: prometheus.NewGaugeVec( + logTimestamp: prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "silentct_certificate_alert", - Help: "The time the certificate without allowlisting was found.", - }, - []string{"stored_at"}, - ), - errorCounter: prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "silentct_error_counter", - Help: "The number of errors propagated to the main loop.", + Name: "silentct_log_timestamp", + Help: "The log's UNIX timestamp in ms.", }, + []string{"log_id"}, ), needRestart: prometheus.NewGauge( prometheus.GaugeOpts{ @@ -57,30 +53,48 @@ func NewMetrics(registry *prometheus.Registry) *Metrics { Help: "A non-zero value if the monitor needs restarting.", }, ), + unexpectedCertificateCount: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "silentct_unexpected_certificate_count", + Help: "Number of certificates without any allowlisting", + }, + []string{"log_id", "log_index", "crt_sans"}, + ), } - - registry.MustRegister(m.logSize, m.logIndex, m.logTimestamp, m.certificateAlert, m.errorCounter, m.needRestart) + registry.MustRegister( + m.errorCounter, + m.logIndex, + m.logSize, + m.logTimestamp, + m.needRestart, + m.unexpectedCertificateCount, + ) return m } func (m *Metrics) LogState(state monitor.State) { - id := state.LogID.Base64String() + id := fmt.Sprintf("%x", state.LogID) m.logIndex.WithLabelValues(id).Set(float64(state.NextIndex)) m.logSize.WithLabelValues(id).Set(float64(state.TreeSize)) m.logTimestamp.WithLabelValues(id).Set(float64(state.Timestamp)) } func (m *Metrics) RemoveLogState(state monitor.State) { - id := state.LogID.Base64String() + id := fmt.Sprintf("%x", state.LogID) m.logIndex.Delete(prometheus.Labels{"id": id}) m.logSize.Delete(prometheus.Labels{"id": id}) m.logTimestamp.Delete(prometheus.Labels{"id": id}) } -func (m *Metrics) CertificateAlert(alerts []index.CertificateInfo) { - m.certificateAlert.Reset() +func (m *Metrics) UnexpectedCertificateCount(alerts []index.CertificateInfo) { + m.unexpectedCertificateCount.Reset() for _, alert := range alerts { - m.certificateAlert.WithLabelValues(alert.StoredAt).Set(float64(alert.ObservedAt.Unix())) + labels := prometheus.Labels{ + "crt_sans": strings.Join(alert.SANs, " "), + "log_id": fmt.Sprintf("%x", alert.LogID), + "log_index": fmt.Sprintf("%d", alert.LogIndex), + } + m.unexpectedCertificateCount.With(labels).Set(1) } } diff --git a/pkg/storage/index/index.go b/pkg/storage/index/index.go index 95eb04a..c85a9e9 100644 --- a/pkg/storage/index/index.go +++ b/pkg/storage/index/index.go @@ -56,7 +56,7 @@ func (ix *Index) AddChain(node string, pem []byte) error { var crtID CertificateID crtID.Set(chain[0]) path := fmt.Sprintf("%s/%s-%s.pem", ix.cfg.TrustDirectory, node, crtID) - if !ix.mem.addChain(crtID, path) { + if !ix.mem.addChain(path, crtID, chain[0].DNSNames) { return nil // duplicate } @@ -76,7 +76,7 @@ func (ix *Index) AddEntries(logID [sha256.Size]byte, entries []monitor.LogEntry) var crtID CertificateID crtID.Set(crt) path := fmt.Sprintf("%s/%x-%d.json", ix.cfg.MatchDirectory, logID[:], entry.LeafIndex) - if !ix.mem.addEntry(crtID, path) { + if !ix.mem.addEntry(path, crtID, crt.DNSNames, logID, entry.LeafIndex) { return nil // duplicate } if err := ioutil.CommitJSON(path, entry); err != nil { diff --git a/pkg/storage/index/inmem.go b/pkg/storage/index/inmem.go index ba48bc1..6184cad 100644 --- a/pkg/storage/index/inmem.go +++ b/pkg/storage/index/inmem.go @@ -16,8 +16,12 @@ func (crtID *CertificateID) Set(crt x509.Certificate) { } type CertificateInfo struct { - ObservedAt time.Time `json:"observed_at"` - StoredAt string `json:"stored_at"` + ObservedAt time.Time `json:"observed_at"` + StoredAt string `json:"stored_at"` + SerialNumber CertificateID `json:"serial_number"` + SANs []string `json:"crt_sans"` + LogID [32]byte `json:"log_id,omitempty"` + LogIndex uint64 `json:"log_index,omitempty"` } // index is an in-memory index of certificates @@ -65,12 +69,12 @@ func (ix *index) triggerAlerts(delay time.Duration) []CertificateInfo { return alerts } -func (ix *index) addChain(crtID CertificateID, path string) bool { +func (ix *index) addChain(path string, crtID CertificateID, sans []string) bool { if _, ok := ix.Legitimate[crtID]; ok { return false // we already marked this certificate as "good" } - entry := CertificateInfo{ObservedAt: time.Now(), StoredAt: path} + entry := CertificateInfo{ObservedAt: time.Now(), StoredAt: path, SerialNumber: crtID, SANs: sans} crtInfos := []CertificateInfo{entry} if v, ok := ix.Alerting[crtID]; ok { crtInfos = append(crtInfos, v...) @@ -84,8 +88,8 @@ func (ix *index) addChain(crtID CertificateID, path string) bool { return true // index updated such that this certificate is marked as "good" } -func (ix *index) addEntry(crtID CertificateID, path string) bool { - crtInfo := CertificateInfo{ObservedAt: time.Now(), StoredAt: path} +func (ix *index) addEntry(path string, crtID CertificateID, sans []string, logID [32]byte, logIndex uint64) bool { + crtInfo := CertificateInfo{ObservedAt: time.Now(), StoredAt: path, SerialNumber: crtID, SANs: sans, LogID: logID, LogIndex: logIndex} if _, ok := ix.Legitimate[crtID]; ok { return add(ix.Legitimate, crtID, crtInfo) } else if _, ok := ix.Alerting[crtID]; ok { diff --git a/scripts/silentct-check b/scripts/silentct-check deleted file mode 100755 index 2c64d67..0000000 --- a/scripts/silentct-check +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash - -# -# A script that generates alerts based on the the silentct-mon prometheus -# metrics. Mainly meant as an example on how to define relevant alerts. -# - -set -eu - -function notice() { - echo "NOTICE: $*" >&2 -} - -function die() { - echo "FATAL: $*" >&2 - exit 1 -} - -#----------------------------------------------------------------------------------------- -# Options -#----------------------------------------------------------------------------------------- -METRICS_AT=${METRICS_AT:-http://localhost:8080/metrics} -ALERT_BACKLOG=${ALERT_BACKLOG:-65536} -ALERT_FRESHNESS=${ALERT_FRESHNESS:-86400} - -#----------------------------------------------------------------------------------------- -# Download the current prometheus metrics -#----------------------------------------------------------------------------------------- -metrics_file=$(mktemp) -trap "rm -f $metrics_file" EXIT -curl -so "$metrics_file" "$METRICS_AT" || die "failed retrieving metrics from $METRICS_AT" - -#----------------------------------------------------------------------------------------- -# Parse metrics -#----------------------------------------------------------------------------------------- -declare -A log_index -declare -A log_size -declare -A log_timestamp -declare -A certificate_alert -while IFS= read -r line; do - if [[ $line =~ ^# ]]; then - continue # skip comments - fi - - if [[ $line =~ ^silentct_log_index ]]; then - id=$(echo "$line" | grep -oP '(?<=id=")[^"]+') - value=$(echo "$line" | awk '{print $NF}') - log_index["$id"]=$value - fi - - if [[ $line =~ ^silentct_log_size ]]; then - id=$(echo "$line" | grep -oP '(?<=id=")[^"]+') - value=$(echo "$line" | awk '{print $NF}') - log_size["$id"]=$value - fi - - if [[ $line =~ ^silentct_log_timestamp ]]; then - id=$(echo "$line" | grep -oP '(?<=id=")[^"]+') - value=$(echo "$line" | awk '{print $NF}') - log_timestamp["$id"]=$value - fi - - if [[ $line =~ ^silentct_certificate_alert ]]; then - stored_at=$(echo "$line" | grep -oP '(?<=stored_at=")[^"]+') - observed_at=$(echo "$line" | awk '{print $NF}') - certificate_alert["$stored_at"]=$observed_at - fi -done <"$metrics_file" - -line=$(grep "^silentct_need_restart" "$metrics_file") -need_restart=$(echo $line | awk '{print $NF}') - -#----------------------------------------------------------------------------------------- -# Output alerts -#----------------------------------------------------------------------------------------- -now=$(date +%s) -for id in "${!log_size[@]}"; do - backlog=$(awk "BEGIN {print ${log_size[$id]} - ${log_index[$id]}}") - if awk "BEGIN {exit !($backlog - $ALERT_BACKLOG >= 0)}"; then - notice "log $id -- backlog is at $backlog" - fi - - unix_timestamp=$(awk "BEGIN {printf \"%.0f\", ${log_timestamp[$id]} / 1000}") - if (( now - unix_timestamp >= ALERT_FRESHNESS )); then - notice "log $id -- latest timestamp at $(date -d @$unix_timestamp)" - fi -done - -for stored_at in "${!certificate_alert[@]}"; do - observed_at=$(awk "BEGIN {printf \"%.0f\", ${certificate_alert[$stored_at]}}") - notice "(mis)-issued certificate? Observed at $(date -d @$observed_at) -- see $stored_at" -done - -if [[ $need_restart != 0 ]]; then - notice "silentct-mon needs to be restarted" -fi |