diff options
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/manager/manager.go | 31 | ||||
| -rw-r--r-- | internal/metrics/metrics.go | 105 | ||||
| -rw-r--r-- | internal/monitor/backoff.go | 83 |
3 files changed, 168 insertions, 51 deletions
diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 90f6507..b839502 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "os" + "strings" "time" "gitlab.torproject.org/rgdd/ct/pkg/metadata" @@ -14,6 +15,7 @@ import ( "rgdd.se/silentct/internal/monitor" "rgdd.se/silentct/pkg/policy" "rgdd.se/silentct/pkg/storage" + "rgdd.se/silentct/pkg/storage/loglist" ) type Config struct { @@ -145,7 +147,7 @@ func (mgr *Manager) startupConfig() error { return err } mgr.mconfigCh <- monitor.MonitoredLog{Config: log, State: state} - mgr.Metrics.LogState(state) + mgr.Metrics.LogState(loglist.FormatLogName(log), state) } return nil } @@ -170,7 +172,7 @@ func (mgr *Manager) removeLogs(logs []metadata.Log) { state, _ := mgr.GetMonitorState(log) mgr.Logger.Infof("removing log %s with %d entries in its backlog\n", log.URL, state.TreeSize-state.NextIndex) mgr.mconfigCh <- monitor.MonitoredLog{Config: log} - mgr.Metrics.RemoveLogState(state) + mgr.Metrics.RemoveLogState(loglist.FormatLogName(log), state) } } @@ -187,7 +189,7 @@ func (mgr *Manager) addLogs(ctx context.Context, logs []metadata.Log) { mgr.Logger.Infof("bootstrapping log %s at next index 0\n", log.URL) } mgr.mconfigCh <- monitor.MonitoredLog{Config: log, State: state} - mgr.Metrics.LogState(state) + mgr.Metrics.LogState(loglist.FormatLogName(log), state) } } @@ -209,22 +211,39 @@ func (mgr *Manager) monitorJob(msg monitor.Event) error { if err := mgr.SetMonitorState(msg.State.LogID, msg.State); err != nil { return err } - mgr.Metrics.LogState(msg.State) for _, err := range msg.Errors { mgr.errorJob(err) } + + // no metrics update if the log has just been removed (final event) + name, err := mgr.Storage.LogList.LogName(msg.State.SignedTreeHead.LogID) + if err == nil { + mgr.Metrics.LogState(name, msg.State) + } return nil } func (mgr *Manager) alertJob() error { + // See if there are any new unexpected certificates alerts, err := mgr.Index.TriggerAlerts() if err != nil { return err } for _, alert := range alerts { - mgr.Logger.Noticef("certificate mis-issuance? No allowlisting for %s\n", alert.StoredAt) + mgr.Logger.Noticef("unexpected certificate: no allowlisting for crt_sans=\"%s\", see log_id=\"%x\" log_index=\"%d\"\n", strings.Join(alert.SANs, " "), alert.LogID, alert.LogIndex) + } + + // Update metrics for the current unexpected certificates + alerting := mgr.Storage.Index.Alerting() + var names []string + for _, alert := range alerting { + name, err := mgr.Storage.LogList.LogName(alert.LogID) + if err != nil { + name = "historic log" + } + names = append(names, name) } - mgr.Metrics.CertificateAlert(mgr.Storage.Index.Alerting()) + mgr.Metrics.UnexpectedCertificateCount(names, mgr.Storage.Index.Alerting()) return nil } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index c5ff0d6..aae46cd 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -1,55 +1,51 @@ package metrics import ( + "fmt" + "strings" + "github.com/prometheus/client_golang/prometheus" "rgdd.se/silentct/internal/monitor" "rgdd.se/silentct/pkg/storage/index" ) type Metrics struct { - logSize *prometheus.GaugeVec - logIndex *prometheus.GaugeVec - logTimestamp *prometheus.GaugeVec - certificateAlert *prometheus.GaugeVec - errorCounter prometheus.Counter - needRestart prometheus.Gauge + errorCounter prometheus.Counter + logIndex *prometheus.GaugeVec + logSize *prometheus.GaugeVec + logTimestamp *prometheus.GaugeVec + needRestart prometheus.Gauge + unexpectedCertificateCount *prometheus.GaugeVec } func NewMetrics(registry *prometheus.Registry) *Metrics { m := &Metrics{ - logSize: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "silentct_log_size", - Help: "The number of entries in the log.", + errorCounter: prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "silentct_error_counter", + Help: "The number of errors propagated to the main loop.", }, - []string{"id"}, ), logIndex: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "silentct_log_index", Help: "The next log entry to be downloaded.", }, - []string{"id"}, + []string{"log_id", "log_name"}, ), - logTimestamp: prometheus.NewGaugeVec( + logSize: prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "silentct_log_timestamp", - Help: "The log's UNIX timestamp in ms.", + Name: "silentct_log_size", + Help: "The number of entries in the log.", }, - []string{"id"}, + []string{"log_id", "log_name"}, ), - certificateAlert: prometheus.NewGaugeVec( + logTimestamp: prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "silentct_certificate_alert", - Help: "The time the certificate without allowlisting was found.", - }, - []string{"stored_at"}, - ), - errorCounter: prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "silentct_error_counter", - Help: "The number of errors propagated to the main loop.", + Name: "silentct_log_timestamp", + Help: "The log's UNIX timestamp in ms.", }, + []string{"log_id", "log_name"}, ), needRestart: prometheus.NewGauge( prometheus.GaugeOpts{ @@ -57,30 +53,55 @@ func NewMetrics(registry *prometheus.Registry) *Metrics { Help: "A non-zero value if the monitor needs restarting.", }, ), + unexpectedCertificateCount: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "silentct_unexpected_certificate_count", + Help: "Number of certificates without any allowlisting", + }, + []string{"log_id", "log_name", "log_index", "crt_sans"}, + ), } - - registry.MustRegister(m.logSize, m.logIndex, m.logTimestamp, m.certificateAlert, m.errorCounter, m.needRestart) + registry.MustRegister( + m.errorCounter, + m.logIndex, + m.logSize, + m.logTimestamp, + m.needRestart, + m.unexpectedCertificateCount, + ) return m } -func (m *Metrics) LogState(state monitor.State) { - id := state.LogID.Base64String() - m.logIndex.WithLabelValues(id).Set(float64(state.NextIndex)) - m.logSize.WithLabelValues(id).Set(float64(state.TreeSize)) - m.logTimestamp.WithLabelValues(id).Set(float64(state.Timestamp)) +func (m *Metrics) LogState(logName string, state monitor.State) { + labels := prometheus.Labels{ + "log_id": fmt.Sprintf("%x", state.LogID[:]), + "log_name": logName, + } + m.logIndex.With(labels).Set(float64(state.NextIndex)) + m.logSize.With(labels).Set(float64(state.TreeSize)) + m.logTimestamp.With(labels).Set(float64(state.Timestamp)) } -func (m *Metrics) RemoveLogState(state monitor.State) { - id := state.LogID.Base64String() - m.logIndex.Delete(prometheus.Labels{"id": id}) - m.logSize.Delete(prometheus.Labels{"id": id}) - m.logTimestamp.Delete(prometheus.Labels{"id": id}) +func (m *Metrics) RemoveLogState(logName string, state monitor.State) { + labels := prometheus.Labels{ + "log_id": fmt.Sprintf("%x", state.LogID[:]), + "log_name": logName, + } + m.logIndex.Delete(labels) + m.logSize.Delete(labels) + m.logTimestamp.Delete(labels) } -func (m *Metrics) CertificateAlert(alerts []index.CertificateInfo) { - m.certificateAlert.Reset() - for _, alert := range alerts { - m.certificateAlert.WithLabelValues(alert.StoredAt).Set(float64(alert.ObservedAt.Unix())) +func (m *Metrics) UnexpectedCertificateCount(logNames []string, alerts []index.CertificateInfo) { + m.unexpectedCertificateCount.Reset() + for i, alert := range alerts { + labels := prometheus.Labels{ + "crt_sans": strings.Join(alert.SANs, " "), + "log_id": fmt.Sprintf("%x", alert.LogID), + "log_name": logNames[i], + "log_index": fmt.Sprintf("%d", alert.LogIndex), + } + m.unexpectedCertificateCount.With(labels).Set(1) } } diff --git a/internal/monitor/backoff.go b/internal/monitor/backoff.go index 63c5f55..3bfff7e 100644 --- a/internal/monitor/backoff.go +++ b/internal/monitor/backoff.go @@ -2,6 +2,7 @@ package monitor import ( "context" + "time" ct "github.com/google/certificate-transparency-go" "github.com/google/certificate-transparency-go/client" @@ -15,6 +16,10 @@ import ( // // For reference on this issue, see: // https://github.com/google/certificate-transparency-go/issues/898 +// +// Update: retries was added for get-sth and proof fetching. +// Only because we need 3x queries that succeed in a row, and some logs seem to +// rate limit globally for all endpoints rather than per endpoint. type backoffClient struct { cli *client.LogClient } @@ -24,15 +29,75 @@ func (bc *backoffClient) BaseURI() string { } func (bc *backoffClient) GetSTH(ctx context.Context) (*ct.SignedTreeHead, error) { - return bc.cli.GetSTH(ctx) + backoff := 1 + for { + rsp, err := bc.cli.GetSTH(ctx) + if err == nil { + return rsp, nil + } + + jcErr, ok := err.(jsonclient.RspError) + if !ok { + return rsp, err + } + if jcErr.StatusCode != 429 { + return rsp, err + } + if err := sleep(ctx, time.Duration(backoff)*time.Second); err != nil { + return nil, err + } + if backoff < 32 { + backoff = 2 * backoff + } + } } func (bc *backoffClient) GetSTHConsistency(ctx context.Context, first, second uint64) ([][]byte, error) { - return bc.cli.GetSTHConsistency(ctx, first, second) + backoff := 1 + for { + rsp, err := bc.cli.GetSTHConsistency(ctx, first, second) + if err == nil { + return rsp, nil + } + + jcErr, ok := err.(jsonclient.RspError) + if !ok { + return rsp, err + } + if jcErr.StatusCode != 429 { + return rsp, err + } + if err := sleep(ctx, time.Duration(backoff)*time.Second); err != nil { + return nil, err + } + if backoff < 32 { + backoff = 2 * backoff + } + } } func (bc *backoffClient) GetProofByHash(ctx context.Context, hash []byte, treeSize uint64) (*ct.GetProofByHashResponse, error) { - return bc.cli.GetProofByHash(ctx, hash, treeSize) + backoff := 1 + for { + rsp, err := bc.cli.GetProofByHash(ctx, hash, treeSize) + if err == nil { + return rsp, nil + } + + jcErr, ok := err.(jsonclient.RspError) + if !ok { + return rsp, err + } + if jcErr.StatusCode != 429 { + return rsp, err + } + if err := sleep(ctx, time.Duration(backoff)*time.Second); err != nil { + return nil, err + } + if backoff < 32 { + backoff = 2 * backoff + } + } } func (bc *backoffClient) GetRawEntries(ctx context.Context, start, end int64) (*ct.GetEntriesResponse, error) { @@ -54,3 +119,15 @@ func (bc *backoffClient) GetRawEntries(ctx context.Context, start, end int64) (* } return rsp, err } + +func sleep(ctx context.Context, d time.Duration) error { + timer := time.NewTimer(d) + defer timer.Stop() + + select { + case <-ctx.Done(): + return ctx.Err() + case <-timer.C: + return nil + } +} |
