From 38df474cf30b0b1d077c8d53b353a859af99c7d6 Mon Sep 17 00:00:00 2001 From: Rasmus Dahlberg Date: Thu, 23 Mar 2023 10:05:48 +0100 Subject: Drop sanitize of SANs Less complex, we will just pass lines to Go's HTTP GET as is. --- cmd_assemble.go | 110 ++++++------------------------------------ internal/sanitize/sanitize.go | 85 -------------------------------- main.go | 2 + 3 files changed, 18 insertions(+), 179 deletions(-) delete mode 100644 internal/sanitize/sanitize.go diff --git a/cmd_assemble.go b/cmd_assemble.go index f084193..ae6af50 100644 --- a/cmd_assemble.go +++ b/cmd_assemble.go @@ -10,10 +10,8 @@ import ( "os" "os/exec" "strconv" - "strings" "time" - "git.cs.kau.se/rasmoste/ct-sans/internal/sanitize" ct "github.com/google/certificate-transparency-go" "gitlab.torproject.org/rgdd/ct/pkg/metadata" ) @@ -66,31 +64,8 @@ func assemble(opts options) error { } logger.Printf("INFO: created %s (%s)", sansFile, size) - logger.Printf("INFO: sanitizing SANs to visit") - visitFile := fmt.Sprintf("%s/%s", archiveDir, opts.visitFile) - skip, tweak, err := sanitizeSANs(sansFile, visitFile) - if err != nil { - return err - } - logger.Printf("INFO: ruled out %d lines while sanitizing domains", skip) - logger.Printf("INFO: tweaked %d domains with \"*.\" prefixes", tweak) - logger.Printf("INFO: created %s", visitFile) - - logger.Printf("INFO: de-duplicating %s with GNU sort", visitFile) - wipFile := fmt.Sprintf("%s.tmp", visitFile) - if err := dedup(opts, wipFile, []string{visitFile}); err != nil { - return err - } - if err := os.Rename(wipFile, visitFile); err != nil { - return err - } - if size, err = fileSize(visitFile); err != nil { - return err - } - logger.Printf("INFO: %s is down to %s", visitFile, size) - logger.Printf("INFO: adding README") - readme, err := makeREADME(opts, sths, skip, tweak, now) + readme, err := makeREADME(opts, sths, now) if err != nil { return err } @@ -115,7 +90,7 @@ func assemble(opts options) error { if err != nil { return err } - if err := os.WriteFile(fmt.Sprintf("%s/sths.json", archiveDir), sthsBytes, 0644); err != nil { + if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, opts.sthsFile), sthsBytes, 0644); err != nil { return err } @@ -141,45 +116,7 @@ func dedup(opts options, outputFile string, inputFiles []string) error { return nil } -func sanitizeSANs(sansFile, visitFile string) (int, int, error) { - fpSANs, err := os.OpenFile(sansFile, os.O_RDONLY, 0644) - if err != nil { - return 0, 0, err - } - defer fpSANs.Close() - scanner := bufio.NewScanner(fpSANs) - max := 128 * 1024 * 1024 - buf := make([]byte, 0, max) - scanner.Buffer(buf, max) - - fpVisit, err := os.OpenFile(visitFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) - if err != nil { - return 0, 0, err - } - defer fpVisit.Close() - - skip := 0 - tweak := 0 - for scanner.Scan() { - domain, err := sanitize.SanitizeDNSName(scanner.Text()) - if err != nil { - skip += 1 - } - if strings.HasPrefix(domain, "*.") { - tweak += 1 - domain = domain[2:] - } - if _, err := fpVisit.WriteString(domain + "\n"); err != nil { - return 0, 0, err - } - } - if err := fpVisit.Sync(); err != nil { - return 0, 0, err - } - return skip, tweak, nil -} - -func makeREADME(opts options, sths []ct.SignedTreeHead, skip, tweak int, now time.Time) (string, error) { +func makeREADME(opts options, sths []ct.SignedTreeHead, now time.Time) (string, error) { snapshotTime, err := readSnapshotTime(opts) if err != nil { return "", err @@ -202,39 +139,24 @@ func makeREADME(opts options, sths []ct.SignedTreeHead, skip, tweak int, now tim Dataset assembled at %s. Contents: - README.md - - metadata.json - - metadata.sig - - sths.json - - sans.lst (see below) - - visit.lst (see below) - - notice.txt (see below) + - %s + - %s + - %s + - %s + - %s -The signed [metadata file][] and tree heads were downloaded at %s. +The signed [metadata file][] and tree heads were downloaded at +%s. [metadata file]: https://groups.google.com/a/chromium.org/g/ct-policy/c/IdbrdAcDQto -## sans.lst - -The result of downloading %d certificates from %d CT logs. -One SAN per line. These lines are sorted and de-duplicated. - -While downloading, %s certificates contained SANs that could not be parsed. -See %s for the exact details. - -## visit.lst - -A rewrite of the sans.lst dataset to be more suitable for visits: - - - Leading and trailing white space is removed - - Trailing dots are removed - - The prefixes "http://" and "https://" are removed - - Lines that contain non-printable ASCII characters are dropped. The - definition of a printable ascii character is '\t' and numerical values - 32-126. In total, %d lines were dropped. - - Make wildcard domains into normal domains by removing the first two bytes. - In total, %d lines were rewritten like this. +In total, %d certificates were downloaded from %d CT logs; +%s certificates contained SANs that could not be parsed. +For more information about these errors, see %s. -`, now.Format(time.UnixDate), snapshotTime.Format(time.UnixDate), numCertificates(sths), len(sths), notice, noticeFile, skip, tweak), nil +The SANs data set is sorted and de-duplicated, one SAN per line. +`, now.Format(time.UnixDate), opts.metadataFile, opts.metadataSignatureFile, opts.sthsFile, opts.sansFile, opts.noticeFile, + snapshotTime.Format(time.UnixDate), numCertificates(sths), len(sths), notice, noticeFile), nil } func fileSize(name string) (string, error) { diff --git a/internal/sanitize/sanitize.go b/internal/sanitize/sanitize.go deleted file mode 100644 index 6fcdf09..0000000 --- a/internal/sanitize/sanitize.go +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2016 Opsmate, Inc. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License, v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// -// This software is distributed WITHOUT A WARRANTY OF ANY KIND. -// See the Mozilla Public License for details. -// -// From: https://github.com/SSLMate/certspotter/blob/master/identifiers.go -// Minor tweaks to get a SanitizeDNSName function for ASCII names only. -package sanitize - -import ( - "fmt" - "strings" -) - -// Try to canonicalize/sanitize the DNS name: -// -// 1. Trim leading and trailing whitespace -// 2. Trim trailing dots -// 3. Trim http:// and https:// prefix -// 4. Convert to lower case -// 5. Error if the DNS labels are not composed of ASCII characters 32-126 or "\t" -// -// Please note that the above is not necessarily a good filter for real CT -// monitoring (this is why we're not applying it in the collect stage). It is -// also not a good filter for getting rid of non-domain names like "funny str". -// It is however simple to understand ("printable ascii chars"), and should be -// good enough for the purpose of assembling a SANs data set from CT logs. -func SanitizeDNSName(value string) (string, error) { - value = trimHttpPrefixString(strings.ToLower(trimTrailingDots(strings.TrimSpace(value)))) - if !isASCIIString([]byte(value)) { - return "", fmt.Errorf("not an ascii string: %x", []byte(value)) - } - - labels := strings.Split(value, ".") - for _, label := range labels { - if !isSaneDNSLabel(label) { - return "", fmt.Errorf("process label %x", []byte(label)) - } - } - return strings.Join(labels, "."), nil -} - -func trimTrailingDots(value string) string { - length := len(value) - for length > 0 && value[length-1] == '.' { - length-- - } - return value[0:length] -} - -func trimHttpPrefixString(value string) string { - if strings.HasPrefix(value, "http://") { - return value[7:] - } else if strings.HasPrefix(value, "https://") { - return value[8:] - } else { - return value - } -} - -func isASCIIString(value []byte) bool { - for _, b := range value { - if b > 127 { - return false - } - } - return true -} - -func isSaneDNSLabel(label string) bool { - for _, ch := range label { - if !isSaneDNSLabelChar(ch) { - return false - } - } - return true -} - -func isSaneDNSLabelChar(ch rune) bool { - return ch == '\t' || (ch >= 32 && ch <= 126) -} diff --git a/main.go b/main.go index d72b347..5c25fa4 100644 --- a/main.go +++ b/main.go @@ -81,6 +81,7 @@ type options struct { metadataSignatureFile string metadataTimestampFile string sthFile string + sthsFile string stateFile string sansFile string visitFile string @@ -126,6 +127,7 @@ func main() { opts.metadataSignatureFile = "metadata.sig" opts.metadataTimestampFile = "metadata.timestamp" opts.sthFile = "sth.json" + opts.sthsFile = "sths.json" opts.stateFile = "th.json" opts.sansFile = "sans.lst" opts.visitFile = "visit.lst" -- cgit v1.2.3