aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Rasmus Dahlberg <rasmus@rgdd.se> 2023-03-20 19:52:21 +0100
committer: Rasmus Dahlberg <rasmus@rgdd.se> 2023-03-20 19:54:36 +0100
commit: 2bee2104c84628a68ef7124a1beefa4e1f98369e (patch)
tree: 1b54458cf370786cc1a62babd2c3129ccfc045d0
parent: 8e8cd8214d579e26e05dcb44fcd53d909e23879c (diff)
Add drafty assemble command
-rw-r--r-- README.md 2
-rw-r--r-- cmd_assemble.go 282
-rw-r--r-- main.go 38
3 files changed, 316 insertions, 6 deletions
diff --git a/README.md b/README.md
index 078275d..7941e20 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,8 @@ UNIX tool `sort`. For the exact commands and an associated dataset manifest:
sort -u ...
...
+ sort -Vuo sans.lst --buffer-size=1024K --temporary-directory=/tmp/t --parallel=2 a.lst b.lst
+
Note that you may need to tweak the number of CPUs, available memory, and
temporary disk space to be suitable for your own system.
diff --git a/cmd_assemble.go b/cmd_assemble.go
index 246ba6e..f084193 100644
--- a/cmd_assemble.go
+++ b/cmd_assemble.go
@@ -1,7 +1,285 @@
package main
-import "fmt"
+import (
+ "bufio"
+ "bytes"
+ "encoding/json"
+ "errors"
+ "fmt"
+ logger "log"
+ "os"
+ "os/exec"
+ "strconv"
+ "strings"
+ "time"
+
+ "git.cs.kau.se/rasmoste/ct-sans/internal/sanitize"
+ ct "github.com/google/certificate-transparency-go"
+ "gitlab.torproject.org/rgdd/ct/pkg/metadata"
+)
func assemble(opts options) error {
- return fmt.Errorf("TODO")
+ now := time.Now()
+ metadataBytes, err := os.ReadFile(fmt.Sprintf("%s/%s", opts.Directory, opts.metadataFile))
+ if err != nil {
+ return err
+ }
+ var md metadata.Metadata
+ if err := json.Unmarshal(metadataBytes, &md); err != nil {
+ return err
+ }
+ var files []string
+ var sths []ct.SignedTreeHead
+ for _, log := range logs(md) {
+ id, _ := log.Key.ID()
+ th, err := readState(opts, id[:])
+ if err != nil {
+ return err
+ }
+ sth, err := readSnapshot(opts, id[:])
+ if err != nil {
+ return err
+ }
+ if uint64(th.TreeSize) != sth.TreeSize {
+ return fmt.Errorf("%s: at tree size %d, want %d", *log.Description, th.TreeSize, sth.TreeSize)
+ }
+ if th.RootHash != sth.SHA256RootHash {
+ return fmt.Errorf("%s: root hash mismatch")
+ }
+
+ files = append(files, fmt.Sprintf("%s/%x/%s", opts.logDirectory, id[:], opts.sansFile))
+ sths = append(sths, sth)
+ }
+
+ logger.Printf("INFO: merging and de-duplicating %d input files with GNU sort", len(files))
+ archiveDir := fmt.Sprintf("%s/%s-ct-sans", opts.archiveDirectory, now.Format("2006-01-02"))
+ if err := os.MkdirAll(archiveDir, os.ModePerm); err != nil {
+ return err
+ }
+ sansFile := fmt.Sprintf("%s/%s", archiveDir, opts.sansFile)
+ if err := dedup(opts, sansFile, files); err != nil {
+ return err
+ }
+ size, err := fileSize(sansFile)
+ if err != nil {
+ return err
+ }
+ logger.Printf("INFO: created %s (%s)", sansFile, size)
+
+ logger.Printf("INFO: sanitizing SANs to visit")
+ visitFile := fmt.Sprintf("%s/%s", archiveDir, opts.visitFile)
+ skip, tweak, err := sanitizeSANs(sansFile, visitFile)
+ if err != nil {
+ return err
+ }
+ logger.Printf("INFO: ruled out %d lines while sanitizing domains", skip)
+ logger.Printf("INFO: tweaked %d domains with \"*.\" prefixes", tweak)
+ logger.Printf("INFO: created %s", visitFile)
+
+ logger.Printf("INFO: de-duplicating %s with GNU sort", visitFile)
+ wipFile := fmt.Sprintf("%s.tmp", visitFile)
+ if err := dedup(opts, wipFile, []string{visitFile}); err != nil {
+ return err
+ }
+ if err := os.Rename(wipFile, visitFile); err != nil {
+ return err
+ }
+ if size, err = fileSize(visitFile); err != nil {
+ return err
+ }
+ logger.Printf("INFO: %s is down to %s", visitFile, size)
+
+ logger.Printf("INFO: adding README")
+ readme, err := makeREADME(opts, sths, skip, tweak, now)
+ if err != nil {
+ return err
+ }
+ if err := os.WriteFile(fmt.Sprintf("%s/README.md", archiveDir), []byte(readme), 0644); err != nil {
+ return err
+ }
+
+ logger.Printf("INFO: adding signed metadata file")
+ sigBytes, err := os.ReadFile(fmt.Sprintf("%s/%s", opts.Directory, opts.metadataSignatureFile))
+ if err != nil {
+ return err
+ }
+ if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, opts.metadataFile), metadataBytes, 0644); err != nil {
+ return err
+ }
+ if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, opts.metadataSignatureFile), sigBytes, 0644); err != nil {
+ return err
+ }
+
+ logger.Printf("INFO: adding signed tree heads")
+ sthsBytes, err := json.MarshalIndent(sths, "", "\t")
+ if err != nil {
+ return err
+ }
+ if err := os.WriteFile(fmt.Sprintf("%s/sths.json", archiveDir), sthsBytes, 0644); err != nil {
+ return err
+ }
+
+ logger.Printf("INFO: uncompressed dataset available in %s", archiveDir)
+ return nil
+}
+
+func dedup(opts options, outputFile string, inputFiles []string) error {
+ cmd := exec.Command("sort", append([]string{
+ "-Vuo", outputFile,
+ "--buffer-size", fmt.Sprintf("%dG", opts.BufferSize),
+ "--temporary-directory", fmt.Sprintf("%s", opts.TempDir),
+ "--parallel", fmt.Sprintf("%d", opts.Parallel),
+ }, inputFiles...)...)
+ if errors.Is(cmd.Err, exec.ErrDot) {
+ cmd.Err = nil
+ }
+ stderr := bytes.NewBuffer(nil)
+ cmd.Stderr = stderr
+ if _, err := cmd.Output(); err != nil {
+ return fmt.Errorf("%s", string(stderr.Bytes()))
+ }
+ return nil
+}
+
+// sanitizeSANs reads sansFile line by line, sanitizes each line as a DNS name,
+// and writes the result to visitFile (created or truncated), one domain per
+// line. Lines that fail to sanitize are dropped, and a leading "*." is removed
+// from wildcard domains. It returns the number of dropped lines and the number
+// of rewritten wildcard domains.
+func sanitizeSANs(sansFile, visitFile string) (int, int, error) {
+	fpSANs, err := os.Open(sansFile)
+	if err != nil {
+		return 0, 0, err
+	}
+	defer fpSANs.Close()
+	scanner := bufio.NewScanner(fpSANs)
+	// Accept lines of up to 128 MiB; the scanner grows its buffer on demand
+	scanner.Buffer(make([]byte, 0, 64*1024), 128*1024*1024)
+
+	fpVisit, err := os.OpenFile(visitFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
+	if err != nil {
+		return 0, 0, err
+	}
+	defer fpVisit.Close()
+	// Buffer the output to avoid one write system call per domain
+	w := bufio.NewWriter(fpVisit)
+
+	skip := 0
+	tweak := 0
+	for scanner.Scan() {
+		domain, err := sanitize.SanitizeDNSName(scanner.Text())
+		if err != nil {
+			// Count the line as dropped and do not write it
+			skip++
+			continue
+		}
+		if strings.HasPrefix(domain, "*.") {
+			tweak++
+			domain = domain[2:]
+		}
+		if _, err := w.WriteString(domain); err != nil {
+			return 0, 0, err
+		}
+		if err := w.WriteByte('\n'); err != nil {
+			return 0, 0, err
+		}
+	}
+	// Scan also stops on read errors and over-long lines; report those
+	// instead of silently producing a truncated visit file.
+	if err := scanner.Err(); err != nil {
+		return 0, 0, fmt.Errorf("reading %s: %w", sansFile, err)
+	}
+	if err := w.Flush(); err != nil {
+		return 0, 0, err
+	}
+	if err := fpVisit.Sync(); err != nil {
+		return 0, 0, err
+	}
+	return skip, tweak, nil
+}
+
+// makeREADME renders the README text of an assembled dataset. The text states
+// when the dataset was assembled (now), when the snapshot was downloaded, the
+// number of certificates and logs covered by sths, the number of lines in the
+// notice file, and how many lines were dropped (skip) or rewritten (tweak)
+// while creating the visit file. An error is returned if the snapshot time
+// cannot be read; a notice file that cannot be read is only logged.
+func makeREADME(opts options, sths []ct.SignedTreeHead, skip, tweak int, now time.Time) (string, error) {
+	downloadedAt, err := readSnapshotTime(opts)
+	if err != nil {
+		return "", err
+	}
+
+	// TODO: save notice file
+	noticeFile := opts.Directory + "/" + opts.noticeFile
+	numNotice, noticeErr := noticeReport(noticeFile)
+	if noticeErr != nil {
+		// TODO: start writing notice prints to a separate file in data/
+		// by default, then make this a hard error. This needs to be
+		// done manually now by grepping for NOTICE in collect.stdout.
+		logger.Printf("WARNING: could not find notice file, skipping")
+		numNotice = "UNKNOWN"
+	}
+
+	return fmt.Sprintf(`# ct-sans dataset
+
+Dataset assembled at %s. Contents:
+
+ - README.md
+ - metadata.json
+ - metadata.sig
+ - sths.json
+ - sans.lst (see below)
+ - visit.lst (see below)
+ - notice.txt (see below)
+
+The signed [metadata file][] and tree heads were downloaded at %s.
+
+[metadata file]: https://groups.google.com/a/chromium.org/g/ct-policy/c/IdbrdAcDQto
+
+## sans.lst
+
+The result of downloading %d certificates from %d CT logs.
+One SAN per line. These lines are sorted and de-duplicated.
+
+While downloading, %s certificates contained SANs that could not be parsed.
+See %s for the exact details.
+
+## visit.lst
+
+A rewrite of the sans.lst dataset to be more suitable for visits:
+
+ - Leading and trailing white space is removed
+ - Trailing dots are removed
+ - The prefixes "http://" and "https://" are removed
+ - Lines that contain non-printable ASCII characters are dropped. The
+ definition of a printable ascii character is '\t' and numerical values
+ 32-126. In total, %d lines were dropped.
+ - Make wildcard domains into normal domains by removing the first two bytes.
+ In total, %d lines were rewritten like this.
+
+`,
+		now.Format(time.UnixDate),
+		downloadedAt.Format(time.UnixDate),
+		numCertificates(sths),
+		len(sths),
+		numNotice,
+		noticeFile,
+		skip,
+		tweak,
+	), nil
+}
+
+func fileSize(name string) (string, error) {
+ fi, err := os.Stat(name)
+ if err != nil {
+ return "", err
+ }
+ size := fmt.Sprintf("%.1f GiB", float64(fi.Size())/float64((1024*1024*1024)))
+ if fi.Size() < 1024*1024*1024 {
+ size = fmt.Sprintf("%.1f MiB", float64(fi.Size())/float64((1024*1024)))
+ }
+ return size, nil
+}
+
+// noticeReport returns the number of lines in the file at path, formatted as a
+// decimal string. An error is returned if the file cannot be opened or if
+// reading stops early, e.g., on an I/O error or a line that exceeds the
+// scanner's default token limit; a partial count is never reported.
+func noticeReport(path string) (string, error) {
+	fp, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer fp.Close()
+
+	num := 0
+	scanner := bufio.NewScanner(fp)
+	for scanner.Scan() {
+		num++
+	}
+	if err := scanner.Err(); err != nil {
+		return "", fmt.Errorf("reading %s: %w", path, err)
+	}
+	return strconv.Itoa(num), nil
+}
+
+// numCertificates returns the sum of the tree sizes of the given signed tree
+// heads, which is zero for an empty list.
+func numCertificates(sths []ct.SignedTreeHead) (sum uint64) {
+	for i := range sths {
+		sum += sths[i].TreeSize
+	}
+	return sum
+}
+
+// readSnapshotTime reads the metadata timestamp file in the working directory
+// and interprets its content as a decimal UNIX timestamp in seconds. White
+// space around the number, such as a trailing newline, is ignored.
+func readSnapshotTime(opts options) (time.Time, error) {
+	path := fmt.Sprintf("%s/%s", opts.Directory, opts.metadataTimestampFile)
+	b, err := os.ReadFile(path)
+	if err != nil {
+		return time.Time{}, err
+	}
+	num, err := strconv.ParseInt(strings.TrimSpace(string(b)), 10, 64)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("parsing %s: %w", path, err)
+	}
+	return time.Unix(num, 0), nil
}
diff --git a/main.go b/main.go
index 3d9b3a7..88d6b90 100644
--- a/main.go
+++ b/main.go
@@ -30,14 +30,17 @@ Usage:
Collect SANs with regards to the current snapshot
ct-sans assemble [Options...]
- Assemble a dataset manifest and print a command that combines,
- sorts, and removes duplicate SANs that were collected.
+ Assemble dataset from the collected data (requires GNU sort)
Help:
ct-sans [-h] [--help]
-Options:
+Snapshot options:
+
+ -d, --directory: The ct-sans working directory (Default: "data")
+
+Collect options:
-d, --directory: The ct-sans working directory (Default: "data")
-w, --workers: Max number of parallel download workers per log (Default: 2).
@@ -46,23 +49,42 @@ Options:
-a, --http-agent: HTTP agent to use in all request (Default: "git.cs.kau.se/rasmoste/ct-sans")
-m, --metrics: How often to emit metrics to stderr (Default: 16s)
+Assemble options:
+
+ -d, --directory: The ct-sans working directory (Default: "data")
+ -b, --buffer-size: Max memory to use in Gigabytes (Default: 1)
+ -t, --temp-dir: Temporary on-disk storage directory (Default: /tmp/ct-sans)
+ -p, --parallel: Number of CPUs to use (Default: 1)
+
`
type options struct { // command-line options plus hard-coded names that main fills in after parsing
- Directory string
+ // Common options
+ Directory string // the ct-sans working directory (-d, --directory)
+
+ // Collect options
WorkersPerLog uint64 // max number of parallel download workers per log (-w, --workers)
PersistSize uint64 // set with -k, --batch-disk
BatchSize uint64 // set with -q, --batch-req
HTTPAgent string // HTTP agent to use in all requests (-a, --http-agent)
MetricsInterval time.Duration // how often to emit metrics to stderr (-m, --metrics)
+ // Assemble options
+ BufferSize uint64 // max memory to use in Gigabytes (-b, --buffer-size)
+ TempDir string // temporary on-disk storage directory (-t, --temp-dir)
+ Parallel uint64 // number of CPUs to use (-p, --parallel)
+
+ // Constants
logDirectory string // Directory + "/logs"
+ archiveDirectory string // Directory + "/archive"
metadataFile string // "metadata.json"
metadataSignatureFile string // "metadata.sig"
metadataTimestampFile string // "metadata.timestamp"
sthFile string // "sth.json"
stateFile string // "th.json"
sansFile string // "sans.lst"
+ visitFile string // "visit.lst"
+ noticeFile string // "notice.txt"
}
func main() {
@@ -77,12 +99,17 @@ func main() {
fs := ctflag.NewFlagSet()
opts := options{}
ctflag.String(&fs, &opts.Directory, "directory", "d", "data")
+
ctflag.Uint64(&fs, &opts.WorkersPerLog, "workers", "w", 2)
ctflag.Uint64(&fs, &opts.PersistSize, "batch-disk", "k", 256)
ctflag.Uint64(&fs, &opts.BatchSize, "batch-req", "q", 128)
ctflag.String(&fs, &opts.HTTPAgent, "http-agent", "a", "git.cs.kau.se/rasmoste/ct-sans")
ctflag.Duration(&fs, &opts.MetricsInterval, "metrics", "m", 16*time.Second)
+ ctflag.Uint64(&fs, &opts.BufferSize, "buffer-size", "b", 1)
+ ctflag.String(&fs, &opts.TempDir, "temp-dir", "t", "/tmp/ct-sans")
+ ctflag.Uint64(&fs, &opts.Parallel, "parallel", "p", 1)
+
// Parse command-line options and hardcode additional values
if err := ctflag.Parse(fs, os.Args[2:]); err != nil {
if err == flag.ErrHelp {
@@ -94,12 +121,15 @@ func main() {
os.Exit(1)
}
opts.logDirectory = opts.Directory + "/" + "logs"
+ opts.archiveDirectory = opts.Directory + "/" + "archive"
opts.metadataFile = "metadata.json"
opts.metadataSignatureFile = "metadata.sig"
opts.metadataTimestampFile = "metadata.timestamp"
opts.sthFile = "sth.json"
opts.stateFile = "th.json"
opts.sansFile = "sans.lst"
+ opts.visitFile = "visit.lst"
+ opts.noticeFile = "notice.txt"
// Hand-over to the respective subcommands
var err error