package main import ( "bufio" "bytes" "encoding/json" "errors" "fmt" logger "log" "os" "os/exec" "strconv" "strings" "time" "git.cs.kau.se/rasmoste/ct-sans/internal/sanitize" ct "github.com/google/certificate-transparency-go" "gitlab.torproject.org/rgdd/ct/pkg/metadata" ) func assemble(opts options) error { now := time.Now() metadataBytes, err := os.ReadFile(fmt.Sprintf("%s/%s", opts.Directory, opts.metadataFile)) if err != nil { return err } var md metadata.Metadata if err := json.Unmarshal(metadataBytes, &md); err != nil { return err } var files []string var sths []ct.SignedTreeHead for _, log := range logs(md) { id, _ := log.Key.ID() th, err := readState(opts, id[:]) if err != nil { return err } sth, err := readSnapshot(opts, id[:]) if err != nil { return err } if uint64(th.TreeSize) != sth.TreeSize { return fmt.Errorf("%s: at tree size %d, want %d", *log.Description, th.TreeSize, sth.TreeSize) } if th.RootHash != sth.SHA256RootHash { return fmt.Errorf("%s: root hash mismatch") } files = append(files, fmt.Sprintf("%s/%x/%s", opts.logDirectory, id[:], opts.sansFile)) sths = append(sths, sth) } logger.Printf("INFO: merging and de-duplicating %d input files with GNU sort", len(files)) archiveDir := fmt.Sprintf("%s/%s-ct-sans", opts.archiveDirectory, now.Format("2006-01-02")) if err := os.MkdirAll(archiveDir, os.ModePerm); err != nil { return err } sansFile := fmt.Sprintf("%s/%s", archiveDir, opts.sansFile) if err := dedup(opts, sansFile, files); err != nil { return err } size, err := fileSize(sansFile) if err != nil { return err } logger.Printf("INFO: created %s (%s)", sansFile, size) logger.Printf("INFO: sanitizing SANs to visit") visitFile := fmt.Sprintf("%s/%s", archiveDir, opts.visitFile) skip, tweak, err := sanitizeSANs(sansFile, visitFile) if err != nil { return err } logger.Printf("INFO: ruled out %d lines while sanitizing domains", skip) logger.Printf("INFO: tweaked %d domains with \"*.\" prefixes", tweak) logger.Printf("INFO: created %s", visitFile) logger.Printf("INFO: de-duplicating %s with GNU sort", visitFile) wipFile := fmt.Sprintf("%s.tmp", visitFile) if err := dedup(opts, wipFile, []string{visitFile}); err != nil { return err } if err := os.Rename(wipFile, visitFile); err != nil { return err } if size, err = fileSize(visitFile); err != nil { return err } logger.Printf("INFO: %s is down to %s", visitFile, size) logger.Printf("INFO: adding README") readme, err := makeREADME(opts, sths, skip, tweak, now) if err != nil { return err } if err := os.WriteFile(fmt.Sprintf("%s/README.md", archiveDir), []byte(readme), 0644); err != nil { return err } logger.Printf("INFO: adding signed metadata file") sigBytes, err := os.ReadFile(fmt.Sprintf("%s/%s", opts.Directory, opts.metadataSignatureFile)) if err != nil { return err } if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, opts.metadataFile), metadataBytes, 0644); err != nil { return err } if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, opts.metadataSignatureFile), sigBytes, 0644); err != nil { return err } logger.Printf("INFO: adding signed tree heads") sthsBytes, err := json.MarshalIndent(sths, "", "\t") if err != nil { return err } if err := os.WriteFile(fmt.Sprintf("%s/sths.json", archiveDir), sthsBytes, 0644); err != nil { return err } logger.Printf("INFO: uncompressed dataset available in %s", archiveDir) return nil } func dedup(opts options, outputFile string, inputFiles []string) error { cmd := exec.Command("sort", append([]string{ "-Vuo", outputFile, "--buffer-size", fmt.Sprintf("%dG", opts.BufferSize), "--temporary-directory", fmt.Sprintf("%s", opts.TempDir), "--parallel", fmt.Sprintf("%d", opts.Parallel), }, inputFiles...)...) if errors.Is(cmd.Err, exec.ErrDot) { cmd.Err = nil } stderr := bytes.NewBuffer(nil) cmd.Stderr = stderr if _, err := cmd.Output(); err != nil { return fmt.Errorf("%s", string(stderr.Bytes())) } return nil } func sanitizeSANs(sansFile, visitFile string) (int, int, error) { fpSANs, err := os.OpenFile(sansFile, os.O_RDONLY, 0644) if err != nil { return 0, 0, err } defer fpSANs.Close() scanner := bufio.NewScanner(fpSANs) max := 128 * 1024 * 1024 buf := make([]byte, 0, max) scanner.Buffer(buf, max) fpVisit, err := os.OpenFile(visitFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) if err != nil { return 0, 0, err } defer fpVisit.Close() skip := 0 tweak := 0 for scanner.Scan() { domain, err := sanitize.SanitizeDNSName(scanner.Text()) if err != nil { skip += 1 } if strings.HasPrefix(domain, "*.") { tweak += 1 domain = domain[2:] } if _, err := fpVisit.WriteString(domain + "\n"); err != nil { return 0, 0, err } } if err := fpVisit.Sync(); err != nil { return 0, 0, err } return skip, tweak, nil } func makeREADME(opts options, sths []ct.SignedTreeHead, skip, tweak int, now time.Time) (string, error) { snapshotTime, err := readSnapshotTime(opts) if err != nil { return "", err } noticeFile := opts.Directory + "/" + opts.noticeFile notice, err := noticeReport(noticeFile) if err != nil { // TODO: start writing notice prints to a separate file in data/ // by default, then make this a hard error. This needs to be // done manually now by grepping for NOTICE in collect.stdout. logger.Printf("WARNING: could not find notice file, skipping") notice = "UNKNOWN" } else { // TODO: save notice file } return fmt.Sprintf(`# ct-sans dataset Dataset assembled at %s. Contents: - README.md - metadata.json - metadata.sig - sths.json - sans.lst (see below) - visit.lst (see below) - notice.txt (see below) The signed [metadata file][] and tree heads were downloaded at %s. [metadata file]: https://groups.google.com/a/chromium.org/g/ct-policy/c/IdbrdAcDQto ## sans.lst The result of downloading %d certificates from %d CT logs. One SAN per line. These lines are sorted and de-duplicated. While downloading, %s certificates contained SANs that could not be parsed. See %s for the exact details. ## visit.lst A rewrite of the sans.lst dataset to be more suitable for visits: - Leading and trailing white space is removed - Trailing dots are removed - The prefixes "http://" and "https://" are removed - Lines that contain non-printable ASCII characters are dropped. The definition of a printable ascii character is '\t' and numerical values 32-126. In total, %d lines were dropped. - Make wildcard domains into normal domains by removing the first two bytes. In total, %d lines were rewritten like this. `, now.Format(time.UnixDate), snapshotTime.Format(time.UnixDate), numCertificates(sths), len(sths), notice, noticeFile, skip, tweak), nil } func fileSize(name string) (string, error) { fi, err := os.Stat(name) if err != nil { return "", err } size := fmt.Sprintf("%.1f GiB", float64(fi.Size())/float64((1024*1024*1024))) if fi.Size() < 1024*1024*1024 { size = fmt.Sprintf("%.1f MiB", float64(fi.Size())/float64((1024*1024))) } return size, nil } func noticeReport(path string) (string, error) { fp, err := os.OpenFile(path, os.O_RDONLY, 0644) if err != nil { return "", err } defer fp.Close() scanner := bufio.NewScanner(fp) num := 0 for scanner.Scan() { _ = scanner.Text() num += 1 } return fmt.Sprintf("%d", num), nil } func numCertificates(sths []ct.SignedTreeHead) (sum uint64) { for _, sth := range sths { sum += sth.TreeSize } return } func readSnapshotTime(opts options) (time.Time, error) { b, err := os.ReadFile(fmt.Sprintf("%s/%s", opts.Directory, opts.metadataTimestampFile)) if err != nil { return time.Time{}, err } num, err := strconv.ParseInt(string(b), 10, 64) if err != nil { return time.Time{}, err } return time.Unix(num, 0), nil }