#!/bin/bash

#
# A script that generates alerts based on the silentct-mon prometheus
# metrics.  Mainly meant as an example on how to define relevant alerts.
#
# Alerts are written to stderr as "NOTICE: ..." lines.  Unrecoverable problems
# (metrics cannot be fetched, expected samples are missing) are written as
# "FATAL: ..." and the script exits with status 1.
#
# Environment variables:
#
#   METRICS_AT       URL to fetch the metrics from
#   ALERT_BACKLOG    alert when (log size - log index) is at least this large
#   ALERT_FRESHNESS  alert when a log's latest timestamp is at least this many
#                    seconds old
#

set -eu

# Print an alert line on stderr.
function notice() {
  echo "NOTICE: $*" >&2
}

# Print an error line on stderr and terminate with exit status 1.
function die() {
  echo "FATAL: $*" >&2
  exit 1
}

#-----------------------------------------------------------------------------------------
# Options
#-----------------------------------------------------------------------------------------
METRICS_AT=${METRICS_AT:-http://localhost:8080/metrics}
ALERT_BACKLOG=${ALERT_BACKLOG:-65536}
ALERT_FRESHNESS=${ALERT_FRESHNESS:-86400}

#-----------------------------------------------------------------------------------------
# Download the current prometheus metrics
#-----------------------------------------------------------------------------------------
metrics_file=$(mktemp) || die "failed creating a temporary file"

# Single quotes: expand (and quote) the path when the trap fires, not now.
trap 'rm -f -- "$metrics_file"' EXIT

# --fail makes HTTP error responses count as a failure instead of having their
# body parsed as if it were metrics.
curl -fso "$metrics_file" "$METRICS_AT" || die "failed retrieving metrics from $METRICS_AT"

#-----------------------------------------------------------------------------------------
# Parse metrics
#-----------------------------------------------------------------------------------------
declare -A log_index=()
declare -A log_size=()
declare -A log_timestamp=()
declare -A certificate_alert=()
need_restart=""

# Kept in variables so that the quotes are matched literally by [[ =~ ]].
id_re='id="([^"]+)"'
stored_at_re='stored_at="([^"]+)"'

# The "|| [[ -n $line ]]" part also processes a final line that lacks a newline.
while IFS= read -r line || [[ -n $line ]]; do
  if [[ $line != silentct_* ]]; then
    continue # skip comments, blank lines, and unrelated metrics
  fi

  # The sample value is the last whitespace-separated field of the line.
  read -r -a fields <<<"$line"
  value=${fields[${#fields[@]} - 1]}

  case $line in
  silentct_log_index*)
    [[ $line =~ $id_re ]] || die "metric without id label: $line"
    log_index["${BASH_REMATCH[1]}"]=$value
    ;;
  silentct_log_size*)
    [[ $line =~ $id_re ]] || die "metric without id label: $line"
    log_size["${BASH_REMATCH[1]}"]=$value
    ;;
  silentct_log_timestamp*)
    [[ $line =~ $id_re ]] || die "metric without id label: $line"
    log_timestamp["${BASH_REMATCH[1]}"]=$value
    ;;
  silentct_certificate_alert*)
    [[ $line =~ $stored_at_re ]] || die "metric without stored_at label: $line"
    certificate_alert["${BASH_REMATCH[1]}"]=$value
    ;;
  silentct_need_restart*)
    need_restart=$value
    ;;
  esac
done <"$metrics_file"

# Without this sample the input is most likely not silentct-mon metrics at all;
# fail loudly rather than exiting without any explanation.
[[ -n $need_restart ]] || die "no silentct_need_restart metric found at $METRICS_AT"

#-----------------------------------------------------------------------------------------
# Output alerts
#-----------------------------------------------------------------------------------------
now=$(date +%s)

for id in "${!log_size[@]}"; do
  [[ -n ${log_index[$id]+set} ]] || die "log $id -- no silentct_log_index metric"
  [[ -n ${log_timestamp[$id]+set} ]] || die "log $id -- no silentct_log_timestamp metric"

  # Arithmetic is delegated to awk because the values may be formatted as
  # floating point (e.g. 1.7e+12).  They are handed over with -v so that they
  # are treated as data and never become part of the awk program text.
  backlog=$(awk -v s="${log_size[$id]}" -v i="${log_index[$id]}" 'BEGIN {print s - i}')
  if awk -v b="$backlog" -v limit="$ALERT_BACKLOG" 'BEGIN {exit !(b - limit >= 0)}'; then
    notice "log $id -- backlog is at $backlog"
  fi

  # The timestamp is divided by 1000 (milliseconds -> seconds) and rounded.
  unix_timestamp=$(awk -v ms="${log_timestamp[$id]}" 'BEGIN {printf "%.0f", ms / 1000}')
  if ((now - unix_timestamp >= ALERT_FRESHNESS)); then
    notice "log $id -- latest timestamp at $(date -d "@$unix_timestamp")"
  fi
done

for stored_at in "${!certificate_alert[@]}"; do
  observed_at=$(awk -v t="${certificate_alert[$stored_at]}" 'BEGIN {printf "%.0f", t}')
  notice "(mis)-issued certificate? Observed at $(date -d "@$observed_at") -- see $stored_at"
done

if [[ $need_restart != 0 ]]; then
  notice "silentct-mon needs to be restarted"
fi