From 2bee2104c84628a68ef7124a1beefa4e1f98369e Mon Sep 17 00:00:00 2001 From: Rasmus Dahlberg Date: Mon, 20 Mar 2023 19:52:21 +0100 Subject: Add drafty assemble command --- README.md | 2 + cmd_assemble.go | 282 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- main.go | 38 +++++++- 3 files changed, 316 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 078275d..7941e20 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,8 @@ UNIX tool `sort`. For the exact commands and an associated dataset manifest: sort -u ... ... + sort -Vuo sans.lst --buffer-size=1024K --temporary-directory=/tmp/t --parallel=2 a.lst b.lst + Note that you may need to tweak the number of CPUs, available memory, and temporary disk space to be suitable for your own system. diff --git a/cmd_assemble.go b/cmd_assemble.go index 246ba6e..f084193 100644 --- a/cmd_assemble.go +++ b/cmd_assemble.go @@ -1,7 +1,285 @@ package main -import "fmt" +import ( + "bufio" + "bytes" + "encoding/json" + "errors" + "fmt" + logger "log" + "os" + "os/exec" + "strconv" + "strings" + "time" + + "git.cs.kau.se/rasmoste/ct-sans/internal/sanitize" + ct "github.com/google/certificate-transparency-go" + "gitlab.torproject.org/rgdd/ct/pkg/metadata" +) func assemble(opts options) error { - return fmt.Errorf("TODO") + now := time.Now() + metadataBytes, err := os.ReadFile(fmt.Sprintf("%s/%s", opts.Directory, opts.metadataFile)) + if err != nil { + return err + } + var md metadata.Metadata + if err := json.Unmarshal(metadataBytes, &md); err != nil { + return err + } + var files []string + var sths []ct.SignedTreeHead + for _, log := range logs(md) { + id, _ := log.Key.ID() + th, err := readState(opts, id[:]) + if err != nil { + return err + } + sth, err := readSnapshot(opts, id[:]) + if err != nil { + return err + } + if uint64(th.TreeSize) != sth.TreeSize { + return fmt.Errorf("%s: at tree size %d, want %d", *log.Description, th.TreeSize, sth.TreeSize) + } + if th.RootHash != sth.SHA256RootHash { + return 
fmt.Errorf("%s: root hash mismatch") + } + + files = append(files, fmt.Sprintf("%s/%x/%s", opts.logDirectory, id[:], opts.sansFile)) + sths = append(sths, sth) + } + + logger.Printf("INFO: merging and de-duplicating %d input files with GNU sort", len(files)) + archiveDir := fmt.Sprintf("%s/%s-ct-sans", opts.archiveDirectory, now.Format("2006-01-02")) + if err := os.MkdirAll(archiveDir, os.ModePerm); err != nil { + return err + } + sansFile := fmt.Sprintf("%s/%s", archiveDir, opts.sansFile) + if err := dedup(opts, sansFile, files); err != nil { + return err + } + size, err := fileSize(sansFile) + if err != nil { + return err + } + logger.Printf("INFO: created %s (%s)", sansFile, size) + + logger.Printf("INFO: sanitizing SANs to visit") + visitFile := fmt.Sprintf("%s/%s", archiveDir, opts.visitFile) + skip, tweak, err := sanitizeSANs(sansFile, visitFile) + if err != nil { + return err + } + logger.Printf("INFO: ruled out %d lines while sanitizing domains", skip) + logger.Printf("INFO: tweaked %d domains with \"*.\" prefixes", tweak) + logger.Printf("INFO: created %s", visitFile) + + logger.Printf("INFO: de-duplicating %s with GNU sort", visitFile) + wipFile := fmt.Sprintf("%s.tmp", visitFile) + if err := dedup(opts, wipFile, []string{visitFile}); err != nil { + return err + } + if err := os.Rename(wipFile, visitFile); err != nil { + return err + } + if size, err = fileSize(visitFile); err != nil { + return err + } + logger.Printf("INFO: %s is down to %s", visitFile, size) + + logger.Printf("INFO: adding README") + readme, err := makeREADME(opts, sths, skip, tweak, now) + if err != nil { + return err + } + if err := os.WriteFile(fmt.Sprintf("%s/README.md", archiveDir), []byte(readme), 0644); err != nil { + return err + } + + logger.Printf("INFO: adding signed metadata file") + sigBytes, err := os.ReadFile(fmt.Sprintf("%s/%s", opts.Directory, opts.metadataSignatureFile)) + if err != nil { + return err + } + if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, 
opts.metadataFile), metadataBytes, 0644); err != nil { + return err + } + if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, opts.metadataSignatureFile), sigBytes, 0644); err != nil { + return err + } + + logger.Printf("INFO: adding signed tree heads") + sthsBytes, err := json.MarshalIndent(sths, "", "\t") + if err != nil { + return err + } + if err := os.WriteFile(fmt.Sprintf("%s/sths.json", archiveDir), sthsBytes, 0644); err != nil { + return err + } + + logger.Printf("INFO: uncompressed dataset available in %s", archiveDir) + return nil +} + +func dedup(opts options, outputFile string, inputFiles []string) error { + cmd := exec.Command("sort", append([]string{ + "-Vuo", outputFile, + "--buffer-size", fmt.Sprintf("%dG", opts.BufferSize), + "--temporary-directory", fmt.Sprintf("%s", opts.TempDir), + "--parallel", fmt.Sprintf("%d", opts.Parallel), + }, inputFiles...)...) + if errors.Is(cmd.Err, exec.ErrDot) { + cmd.Err = nil + } + stderr := bytes.NewBuffer(nil) + cmd.Stderr = stderr + if _, err := cmd.Output(); err != nil { + return fmt.Errorf("%s", string(stderr.Bytes())) + } + return nil +} + +func sanitizeSANs(sansFile, visitFile string) (int, int, error) { + fpSANs, err := os.OpenFile(sansFile, os.O_RDONLY, 0644) + if err != nil { + return 0, 0, err + } + defer fpSANs.Close() + scanner := bufio.NewScanner(fpSANs) + max := 128 * 1024 * 1024 + buf := make([]byte, 0, max) + scanner.Buffer(buf, max) + + fpVisit, err := os.OpenFile(visitFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + if err != nil { + return 0, 0, err + } + defer fpVisit.Close() + + skip := 0 + tweak := 0 + for scanner.Scan() { + domain, err := sanitize.SanitizeDNSName(scanner.Text()) + if err != nil { + skip += 1 + } + if strings.HasPrefix(domain, "*.") { + tweak += 1 + domain = domain[2:] + } + if _, err := fpVisit.WriteString(domain + "\n"); err != nil { + return 0, 0, err + } + } + if err := fpVisit.Sync(); err != nil { + return 0, 0, err + } + return skip, tweak, nil +} + +func 
makeREADME(opts options, sths []ct.SignedTreeHead, skip, tweak int, now time.Time) (string, error) { + snapshotTime, err := readSnapshotTime(opts) + if err != nil { + return "", err + } + + noticeFile := opts.Directory + "/" + opts.noticeFile + notice, err := noticeReport(noticeFile) + if err != nil { + // TODO: start writing notice prints to a separate file in data/ + // by default, then make this a hard error. This needs to be + // done manually now by grepping for NOTICE in collect.stdout. + logger.Printf("WARNING: could not find notice file, skipping") + notice = "UNKNOWN" + } else { + // TODO: save notice file + } + + return fmt.Sprintf(`# ct-sans dataset + +Dataset assembled at %s. Contents: + + - README.md + - metadata.json + - metadata.sig + - sths.json + - sans.lst (see below) + - visit.lst (see below) + - notice.txt (see below) + +The signed [metadata file][] and tree heads were downloaded at %s. + +[metadata file]: https://groups.google.com/a/chromium.org/g/ct-policy/c/IdbrdAcDQto + +## sans.lst + +The result of downloading %d certificates from %d CT logs. +One SAN per line. These lines are sorted and de-duplicated. + +While downloading, %s certificates contained SANs that could not be parsed. +See %s for the exact details. + +## visit.lst + +A rewrite of the sans.lst dataset to be more suitable for visits: + + - Leading and trailing white space is removed + - Trailing dots are removed + - The prefixes "http://" and "https://" are removed + - Lines that contain non-printable ASCII characters are dropped. The + definition of a printable ascii character is '\t' and numerical values + 32-126. In total, %d lines were dropped. + - Make wildcard domains into normal domains by removing the first two bytes. + In total, %d lines were rewritten like this. 
+ +`, now.Format(time.UnixDate), snapshotTime.Format(time.UnixDate), numCertificates(sths), len(sths), notice, noticeFile, skip, tweak), nil +} + +func fileSize(name string) (string, error) { + fi, err := os.Stat(name) + if err != nil { + return "", err + } + size := fmt.Sprintf("%.1f GiB", float64(fi.Size())/float64((1024*1024*1024))) + if fi.Size() < 1024*1024*1024 { + size = fmt.Sprintf("%.1f MiB", float64(fi.Size())/float64((1024*1024))) + } + return size, nil +} + +func noticeReport(path string) (string, error) { + fp, err := os.OpenFile(path, os.O_RDONLY, 0644) + if err != nil { + return "", err + } + defer fp.Close() + + scanner := bufio.NewScanner(fp) + num := 0 + for scanner.Scan() { + _ = scanner.Text() + num += 1 + } + return fmt.Sprintf("%d", num), nil +} + +func numCertificates(sths []ct.SignedTreeHead) (sum uint64) { + for _, sth := range sths { + sum += sth.TreeSize + } + return +} + +func readSnapshotTime(opts options) (time.Time, error) { + b, err := os.ReadFile(fmt.Sprintf("%s/%s", opts.Directory, opts.metadataTimestampFile)) + if err != nil { + return time.Time{}, err + } + num, err := strconv.ParseInt(string(b), 10, 64) + if err != nil { + return time.Time{}, err + } + return time.Unix(num, 0), nil } diff --git a/main.go b/main.go index 3d9b3a7..88d6b90 100644 --- a/main.go +++ b/main.go @@ -30,14 +30,17 @@ Usage: Collect SANs with regards to the current snapshot ct-sans assemble [Options...] - Assemble a dataset manifest and print a command that combines, - sorts, and removes duplicate SANs that were collected. + Assemble dataset from the collected data (requires GNU sort) Help: ct-sans [-h] [--help] -Options: +Snapshot options: + + -d, --directory: The ct-sans working directory (Default: "data") + +Collect options: -d, --directory: The ct-sans working directory (Default: "data") -w, --workers: Max number of parallel download workers per log (Default: 2). 
@@ -46,23 +49,42 @@ Options: -a, --http-agent: HTTP agent to use in all request (Default: "git.cs.kau.se/rasmoste/ct-sans") -m, --metrics: How often to emit metrics to stderr (Default: 16s) +Assemble options: + + -d, --directory: The ct-sans working directory (Default: "data") + -b, --buffer-size: Max memory to use in Gigabytes (Default: 1) + -t, --temp-dir: Temporary on-disk storage directory (Default: /tmp/ct-sans) + -p, --parallel: Number of CPUs to use (Default: 1) + ` type options struct { - Directory string + // Common options + Directory string + + // Collect options WorkersPerLog uint64 PersistSize uint64 BatchSize uint64 HTTPAgent string MetricsInterval time.Duration + // Assemble options + BufferSize uint64 + TempDir string + Parallel uint64 + + // Constants logDirectory string + archiveDirectory string metadataFile string metadataSignatureFile string metadataTimestampFile string sthFile string stateFile string sansFile string + visitFile string + noticeFile string } func main() { @@ -77,12 +99,17 @@ func main() { fs := ctflag.NewFlagSet() opts := options{} ctflag.String(&fs, &opts.Directory, "directory", "d", "data") + ctflag.Uint64(&fs, &opts.WorkersPerLog, "workers", "w", 2) ctflag.Uint64(&fs, &opts.PersistSize, "batch-disk", "k", 256) ctflag.Uint64(&fs, &opts.BatchSize, "batch-req", "q", 128) ctflag.String(&fs, &opts.HTTPAgent, "http-agent", "a", "git.cs.kau.se/rasmoste/ct-sans") ctflag.Duration(&fs, &opts.MetricsInterval, "metrics", "m", 16*time.Second) + ctflag.Uint64(&fs, &opts.BufferSize, "buffer-size", "b", 1) + ctflag.String(&fs, &opts.TempDir, "temp-dir", "t", "/tmp/ct-sans") + ctflag.Uint64(&fs, &opts.Parallel, "parallel", "p", 1) + // Parse command-line options and hardcode additional values if err := ctflag.Parse(fs, os.Args[2:]); err != nil { if err == flag.ErrHelp { @@ -94,12 +121,15 @@ func main() { os.Exit(1) } opts.logDirectory = opts.Directory + "/" + "logs" + opts.archiveDirectory = opts.Directory + "/" + "archive" opts.metadataFile = 
"metadata.json" opts.metadataSignatureFile = "metadata.sig" opts.metadataTimestampFile = "metadata.timestamp" opts.sthFile = "sth.json" opts.stateFile = "th.json" opts.sansFile = "sans.lst" + opts.visitFile = "visit.lst" + opts.noticeFile = "notice.txt" // Hand-over to the respective subcommands var err error -- cgit v1.2.3