From 38df474cf30b0b1d077c8d53b353a859af99c7d6 Mon Sep 17 00:00:00 2001
From: Rasmus Dahlberg <rasmus@rgdd.se>
Date: Thu, 23 Mar 2023 10:05:48 +0100
Subject: Drop sanitize of SANs

This is less complex: we will just pass lines to Go's HTTP GET as is.
---
 cmd_assemble.go               | 110 ++++++------------------------------------
 internal/sanitize/sanitize.go |  85 --------------------------------
 main.go                       |   2 +
 3 files changed, 18 insertions(+), 179 deletions(-)
 delete mode 100644 internal/sanitize/sanitize.go

diff --git a/cmd_assemble.go b/cmd_assemble.go
index f084193..ae6af50 100644
--- a/cmd_assemble.go
+++ b/cmd_assemble.go
@@ -10,10 +10,8 @@ import (
 	"os"
 	"os/exec"
 	"strconv"
-	"strings"
 	"time"
 
-	"git.cs.kau.se/rasmoste/ct-sans/internal/sanitize"
 	ct "github.com/google/certificate-transparency-go"
 	"gitlab.torproject.org/rgdd/ct/pkg/metadata"
 )
@@ -66,31 +64,8 @@ func assemble(opts options) error {
 	}
 	logger.Printf("INFO: created %s (%s)", sansFile, size)
 
-	logger.Printf("INFO: sanitizing SANs to visit")
-	visitFile := fmt.Sprintf("%s/%s", archiveDir, opts.visitFile)
-	skip, tweak, err := sanitizeSANs(sansFile, visitFile)
-	if err != nil {
-		return err
-	}
-	logger.Printf("INFO: ruled out %d lines while sanitizing domains", skip)
-	logger.Printf("INFO: tweaked %d domains with \"*.\" prefixes", tweak)
-	logger.Printf("INFO: created %s", visitFile)
-
-	logger.Printf("INFO: de-duplicating %s with GNU sort", visitFile)
-	wipFile := fmt.Sprintf("%s.tmp", visitFile)
-	if err := dedup(opts, wipFile, []string{visitFile}); err != nil {
-		return err
-	}
-	if err := os.Rename(wipFile, visitFile); err != nil {
-		return err
-	}
-	if size, err = fileSize(visitFile); err != nil {
-		return err
-	}
-	logger.Printf("INFO: %s is down to %s", visitFile, size)
-
 	logger.Printf("INFO: adding README")
-	readme, err := makeREADME(opts, sths, skip, tweak, now)
+	readme, err := makeREADME(opts, sths, now)
 	if err != nil {
 		return err
 	}
@@ -115,7 +90,7 @@ func assemble(opts options) error {
 	if err != nil {
 		return err
 	}
-	if err := os.WriteFile(fmt.Sprintf("%s/sths.json", archiveDir), sthsBytes, 0644); err != nil {
+	if err := os.WriteFile(fmt.Sprintf("%s/%s", archiveDir, opts.sthsFile), sthsBytes, 0644); err != nil {
 		return err
 	}
 
@@ -141,45 +116,7 @@ func dedup(opts options, outputFile string, inputFiles []string) error {
 	return nil
 }
 
-func sanitizeSANs(sansFile, visitFile string) (int, int, error) {
-	fpSANs, err := os.OpenFile(sansFile, os.O_RDONLY, 0644)
-	if err != nil {
-		return 0, 0, err
-	}
-	defer fpSANs.Close()
-	scanner := bufio.NewScanner(fpSANs)
-	max := 128 * 1024 * 1024
-	buf := make([]byte, 0, max)
-	scanner.Buffer(buf, max)
-
-	fpVisit, err := os.OpenFile(visitFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
-	if err != nil {
-		return 0, 0, err
-	}
-	defer fpVisit.Close()
-
-	skip := 0
-	tweak := 0
-	for scanner.Scan() {
-		domain, err := sanitize.SanitizeDNSName(scanner.Text())
-		if err != nil {
-			skip += 1
-		}
-		if strings.HasPrefix(domain, "*.") {
-			tweak += 1
-			domain = domain[2:]
-		}
-		if _, err := fpVisit.WriteString(domain + "\n"); err != nil {
-			return 0, 0, err
-		}
-	}
-	if err := fpVisit.Sync(); err != nil {
-		return 0, 0, err
-	}
-	return skip, tweak, nil
-}
-
-func makeREADME(opts options, sths []ct.SignedTreeHead, skip, tweak int, now time.Time) (string, error) {
+func makeREADME(opts options, sths []ct.SignedTreeHead, now time.Time) (string, error) {
 	snapshotTime, err := readSnapshotTime(opts)
 	if err != nil {
 		return "", err
@@ -202,39 +139,24 @@ func makeREADME(opts options, sths []ct.SignedTreeHead, skip, tweak int, now tim
 Dataset assembled at %s.  Contents:
 
   - README.md
-  - metadata.json
-  - metadata.sig
-  - sths.json
-  - sans.lst (see below)
-  - visit.lst (see below)
-  - notice.txt (see below)
+  - %s
+  - %s
+  - %s
+  - %s
+  - %s
 
-The signed [metadata file][] and tree heads were downloaded at %s.
+The signed [metadata file][] and tree heads were downloaded at
+%s.
 
 [metadata file]: https://groups.google.com/a/chromium.org/g/ct-policy/c/IdbrdAcDQto
 
-## sans.lst
-
-The result of downloading %d certificates from %d CT logs.
-One SAN per line.  These lines are sorted and de-duplicated.
-
-While downloading, %s certificates contained SANs that could not be parsed.
-See %s for the exact details.
-
-## visit.lst
-
-A rewrite of the sans.lst dataset to be more suitable for visits:
-
-  - Leading and trailing white space is removed
-  - Trailing dots are removed
-  - The prefixes "http://" and "https://" are removed
-  - Lines that contain non-printable ASCII characters are dropped.  The
-    definition of a printable ascii character is '\t' and numerical values
-    32-126.  In total, %d lines were dropped.
-  - Make wildcard domains into normal domains by removing the first two bytes.
-    In total, %d lines were rewritten like this.
+In total, %d certificates were downloaded from %d CT logs;
+%s certificates contained SANs that could not be parsed.
+For more information about these errors, see %s.
 
-`, now.Format(time.UnixDate), snapshotTime.Format(time.UnixDate), numCertificates(sths), len(sths), notice, noticeFile, skip, tweak), nil
+The SANs data set is sorted and de-duplicated, one SAN per line.
+`, now.Format(time.UnixDate), opts.metadataFile, opts.metadataSignatureFile, opts.sthsFile, opts.sansFile, opts.noticeFile,
+		snapshotTime.Format(time.UnixDate), numCertificates(sths), len(sths), notice, noticeFile), nil
 }
 
 func fileSize(name string) (string, error) {
diff --git a/internal/sanitize/sanitize.go b/internal/sanitize/sanitize.go
deleted file mode 100644
index 6fcdf09..0000000
--- a/internal/sanitize/sanitize.go
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (C) 2016 Opsmate, Inc.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License, v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-//
-// This software is distributed WITHOUT A WARRANTY OF ANY KIND.
-// See the Mozilla Public License for details.
-//
-// From: https://github.com/SSLMate/certspotter/blob/master/identifiers.go
-// Minor tweaks to get a SanitizeDNSName function for ASCII names only.
-package sanitize
-
-import (
-	"fmt"
-	"strings"
-)
-
-// Try to canonicalize/sanitize the DNS name:
-//
-//  1. Trim leading and trailing whitespace
-//  2. Trim trailing dots
-//  3. Trim http:// and https:// prefix
-//  4. Convert to lower case
-//  5. Error if the DNS labels are not composed of ASCII characters 32-126 or "\t"
-//
-// Please note that the above is not necessarily a good filter for real CT
-// monitoring (this is why we're not applying it in the collect stage).  It is
-// also not a good filter for getting rid of non-domain names like "funny str".
-// It is however simple to understand ("printable ascii chars"), and should be
-// good enough for the purpose of assembling a SANs data set from CT logs.
-func SanitizeDNSName(value string) (string, error) {
-	value = trimHttpPrefixString(strings.ToLower(trimTrailingDots(strings.TrimSpace(value))))
-	if !isASCIIString([]byte(value)) {
-		return "", fmt.Errorf("not an ascii string: %x", []byte(value))
-	}
-
-	labels := strings.Split(value, ".")
-	for _, label := range labels {
-		if !isSaneDNSLabel(label) {
-			return "", fmt.Errorf("process label %x", []byte(label))
-		}
-	}
-	return strings.Join(labels, "."), nil
-}
-
-func trimTrailingDots(value string) string {
-	length := len(value)
-	for length > 0 && value[length-1] == '.' {
-		length--
-	}
-	return value[0:length]
-}
-
-func trimHttpPrefixString(value string) string {
-	if strings.HasPrefix(value, "http://") {
-		return value[7:]
-	} else if strings.HasPrefix(value, "https://") {
-		return value[8:]
-	} else {
-		return value
-	}
-}
-
-func isASCIIString(value []byte) bool {
-	for _, b := range value {
-		if b > 127 {
-			return false
-		}
-	}
-	return true
-}
-
-func isSaneDNSLabel(label string) bool {
-	for _, ch := range label {
-		if !isSaneDNSLabelChar(ch) {
-			return false
-		}
-	}
-	return true
-}
-
-func isSaneDNSLabelChar(ch rune) bool {
-	return ch == '\t' || (ch >= 32 && ch <= 126)
-}
diff --git a/main.go b/main.go
index d72b347..5c25fa4 100644
--- a/main.go
+++ b/main.go
@@ -81,6 +81,7 @@ type options struct {
 	metadataSignatureFile string
 	metadataTimestampFile string
 	sthFile               string
+	sthsFile              string
 	stateFile             string
 	sansFile              string
 	visitFile             string
@@ -126,6 +127,7 @@ func main() {
 	opts.metadataSignatureFile = "metadata.sig"
 	opts.metadataTimestampFile = "metadata.timestamp"
 	opts.sthFile = "sth.json"
+	opts.sthsFile = "sths.json"
 	opts.stateFile = "th.json"
 	opts.sansFile = "sans.lst"
 	opts.visitFile = "visit.lst"
-- 
cgit v1.2.3