From ad9fb49670e28414637761bac4b8e8940e2d6770 Mon Sep 17 00:00:00 2001 From: Rasmus Dahlberg Date: Thu, 23 Mar 2023 11:09:54 +0100 Subject: Automate handling of notice file Here's a hacky tool to migrate our ongoing v0.0.1 measurement once it's done. I.e., just split-up the NOTICE prints we have in collect.stdout, putting them in per-log notice files that happens automatically now. ``` // Package main provides a hacky tool that extracts NOTICE: prints // from a file collect.stdout, putting them in the logs data directories as // notice.txt. Only meant to migrate away from v0.0.1 that did not store // per-log notice files automatically, which makes things less error-prone. package main import ( "bytes" "encoding/json" "fmt" logger "log" "os" "strings" "gitlab.torproject.org/rgdd/ct/pkg/metadata" ) func main() { directory := "../data" logDirectory := fmt.Sprintf("%s/logs", directory) noticeFile := "../collect.stdout" b, err := os.ReadFile(fmt.Sprintf("%s/metadata.json", directory)) if err != nil { logger.Fatal(err) } var md metadata.Metadata if err := json.Unmarshal(b, &md); err != nil { logger.Fatal(err) } if b, err = os.ReadFile(noticeFile); err != nil { logger.Fatal(err) } lines := bytes.Split(b, []byte("\n")) for _, log := range logs(md) { id, _ := log.Key.ID() desc := *log.Description var notes []byte var numNotes int for _, line := range lines[:len(lines)-1] { if strings.Contains(string(line), fmt.Sprintf("NOTICE: %s", desc)) { notes = append(notes, line...) notes = append(notes, []byte("\n")...) numNotes += 1 } } if len(notes) == 0 { logger.Printf("%s: no notices", desc) continue } logger.Printf("%s: %d notices", desc, numNotes) if err := os.WriteFile(fmt.Sprintf("%s/%x/notice.txt", logDirectory, id[:]), notes, 0644); err != nil { logger.Fatal(err) } } } func logs(md metadata.Metadata) (logs []metadata.Log) { for _, operators := range md.Operators { for _, log := range operators.Logs { if log.Description == nil { logger.Printf("WARNING: skipping log without description") continue } if log.State == nil { continue // skip logs with unknown states } if log.State.Name == metadata.LogStatePending { continue // pending logs do not count towards CT-compliance } if log.State.Name == metadata.LogStateRetired { continue // retired logs are not necessarily reachable } if log.State.Name == metadata.LogStateRejected { continue // rejected logs do not count towards CT-compliance } logs = append(logs, log) } } return } ``` --- cmd_assemble.go | 51 ++++++++++++++++++++++++++++--------------------- cmd_collect.go | 29 ++++++++++++++++++++++------ internal/chunk/chunk.go | 2 ++ 3 files changed, 54 insertions(+), 28 deletions(-) diff --git a/cmd_assemble.go b/cmd_assemble.go index ae6af50..69a7173 100644 --- a/cmd_assemble.go +++ b/cmd_assemble.go @@ -26,7 +26,8 @@ func assemble(opts options) error { if err := json.Unmarshal(metadataBytes, &md); err != nil { return err } - var files []string + var sanFiles []string + var noticeFiles []string var sths []ct.SignedTreeHead for _, log := range logs(md) { id, _ := log.Key.ID() @@ -45,17 +46,18 @@ func assemble(opts options) error { return fmt.Errorf("%s: root hash mismatch") } - files = append(files, fmt.Sprintf("%s/%x/%s", opts.logDirectory, id[:], opts.sansFile)) + sanFiles = append(sanFiles, fmt.Sprintf("%s/%x/%s", opts.logDirectory, id[:], opts.sansFile)) + noticeFiles = append(noticeFiles, fmt.Sprintf("%s/%x/%s", opts.logDirectory, id[:], opts.noticeFile)) sths = append(sths, sth) } - logger.Printf("INFO: merging and de-duplicating %d input files with GNU sort", len(files)) + logger.Printf("INFO: merging and de-duplicating %d input files with GNU sort", len(sanFiles)) archiveDir := fmt.Sprintf("%s/%s-ct-sans", opts.archiveDirectory, now.Format("2006-01-02")) if err := os.MkdirAll(archiveDir, os.ModePerm); err != nil { return err } sansFile := fmt.Sprintf("%s/%s", archiveDir, opts.sansFile) - if err := dedup(opts, sansFile, files); err != nil { + if err := dedup(opts, sansFile, sanFiles); err != nil { return err } size, err := fileSize(sansFile) @@ -64,8 +66,26 @@ func assemble(opts options) error { } logger.Printf("INFO: created %s (%s)", sansFile, size) + logger.Printf("INFO: adding notice file") + var notes []byte + for _, noticeFile := range noticeFiles { + b, err := os.ReadFile(noticeFile) + if errors.Is(err, os.ErrNotExist) { + continue // no notes, great + } else if err != nil { + return err + } + + notes = append(notes, b...) + } + if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, opts.noticeFile), notes, 0644); err != nil { + return err + } + numNotes := len(bytes.Split(notes, []byte("\n"))) - 1 + logger.Printf("INFO: %d notes in total\n", numNotes) + logger.Printf("INFO: adding README") - readme, err := makeREADME(opts, sths, now) + readme, err := makeREADME(opts, sths, numNotes, now) if err != nil { return err } @@ -116,24 +136,11 @@ func dedup(opts options, outputFile string, inputFiles []string) error { return nil } -func makeREADME(opts options, sths []ct.SignedTreeHead, now time.Time) (string, error) { +func makeREADME(opts options, sths []ct.SignedTreeHead, numNotes int, now time.Time) (string, error) { snapshotTime, err := readSnapshotTime(opts) if err != nil { return "", err } - - noticeFile := opts.Directory + "/" + opts.noticeFile - notice, err := noticeReport(noticeFile) - if err != nil { - // TODO: start writing notice prints to a separate file in data/ - // by default, then make this a hard error. This needs to be - // done manually now by grepping for NOTICE in collect.stdout. - logger.Printf("WARNING: could not find notice file, skipping") - notice = "UNKNOWN" - } else { - // TODO: save notice file - } - return fmt.Sprintf(`# ct-sans dataset Dataset assembled at %s. Contents: @@ -151,12 +158,12 @@ The signed [metadata file][] and tree heads were downloaded at [metadata file]: https://groups.google.com/a/chromium.org/g/ct-policy/c/IdbrdAcDQto In total, %d certificates were downloaded from %d CT logs; -%s certificates contained SANs that could not be parsed. +%d certificates contained SANs that could not be parsed. For more information about these errors, see %s. The SANs data set is sorted and de-duplicated, one SAN per line. -`, now.Format(time.UnixDate), opts.metadataFile, opts.metadataSignatureFile, opts.sthsFile, opts.sansFile, opts.noticeFile, - snapshotTime.Format(time.UnixDate), numCertificates(sths), len(sths), notice, noticeFile), nil +`, now.Format(time.UnixDate), opts.metadataFile, opts.metadataSignatureFile, opts.sthsFile, opts.noticeFile, opts.sansFile, + snapshotTime.Format(time.UnixDate), numCertificates(sths), len(sths), numNotes, opts.noticeFile), nil } func fileSize(name string) (string, error) { diff --git a/cmd_collect.go b/cmd_collect.go index 4d93271..742884a 100644 --- a/cmd_collect.go +++ b/cmd_collect.go @@ -3,7 +3,6 @@ package main import ( "container/heap" "context" - "crypto/sha256" "encoding/json" "fmt" logger "log" @@ -121,15 +120,18 @@ func collect(opts options) error { // chunk that a single sequencer can verify and persist // callback := func(eb scanner.EntryBatch) { - leafHashes := [][sha256.Size]byte{} + c := &chunk.Chunk{Start: eb.Start} for i := 0; i < len(eb.Entries); i++ { - leafHashes = append(leafHashes, merkle.HashLeafNode(eb.Entries[i].LeafInput)) + c.LeafHashes = append(c.LeafHashes, merkle.HashLeafNode(eb.Entries[i].LeafInput)) } - sans, errs := x509.SANsFromLeafEntries(eb.Start, eb.Entries) + + var errs []error + c.SANs, errs = x509.SANsFromLeafEntries(eb.Start, eb.Entries) for _, err := range errs { - logger.Printf("NOTICE: %s: %v", *log.Description, err) + c.Notes = append(c.Notes, fmt.Sprintf("NOTICE: %s: %v", *log.Description, err)) } - chunksCh <- &chunk.Chunk{eb.Start, leafHashes, sans} + + chunksCh <- c } if err := fetcher.Run(ctx, callback); err != nil { @@ -275,6 +277,21 @@ func persist(c *chunk.Chunk, return false, err } + // Persist notes to disk + if len(c.Notes) > 0 { + fpn, err := os.OpenFile(fmt.Sprintf("%s/%x/%s", opts.logDirectory, logID, opts.noticeFile), os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + return false, err + } + defer fpn.Close() + if _, err := fpn.WriteString(strings.Join(c.Notes, "\n") + "\n"); err != nil { + return false, err + } + if err := fpn.Sync(); err != nil { + return false, err + } + } + // Persist new tree state to disk b, err := json.Marshal(&newTH) if err != nil { diff --git a/internal/chunk/chunk.go b/internal/chunk/chunk.go index 7fccc9b..64adc76 100644 --- a/internal/chunk/chunk.go +++ b/internal/chunk/chunk.go @@ -18,6 +18,7 @@ type Chunk struct { Start int64 // index of first leaf in this chunk LeafHashes [][sha256.Size]byte // in-order leaf hashes in this chunk SANs []string // sans of all leaves in this chunk + Notes []string // notes about this chunk, e.g., errors } // ChunkHeap is a min-heap of chunks wrt. to start indices. Use TPush() and @@ -60,6 +61,7 @@ func (h *ChunkHeap) Sequence(start int64) bool { s.LeafHashes = append(s.LeafHashes, c.LeafHashes...) s.SANs = append(s.SANs, c.SANs...) + s.Notes = append(s.Notes, c.Notes...) } // Put back the largest in-order chunk we have -- cgit v1.2.3