From 942e8d24d75eed2473576c22c5894a72ea1f9b30 Mon Sep 17 00:00:00 2001
From: Rasmus Dahlberg <rasmus@rgdd.se>
Date: Thu, 22 Aug 2024 19:42:20 +0200
Subject: Add another digest script

$ ./scripts/digest2.py -i /home/rgdd/Downloads/2023-04-03-ct-sans/au-mel/*.stdout /home/rgdd/Downloads/2023-04-03-ct-sans/us-nyc/*.stdout /home/rgdd/Downloads/2023-04-03-ct-sans/de-fra/*.stdout 2>&1
digest2.py:26 INFO: found 3330 onions via Onion-Location
digest2.py:27 INFO: found 3077 via HTTP headers
digest2.py:28 INFO: found 281 via HTML tags
digest2.py:29 INFO: found 28 via both HTTP and HTML
---
 scripts/digest2.py | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100755 scripts/digest2.py

diff --git a/scripts/digest2.py b/scripts/digest2.py
new file mode 100755
index 0000000..d01293b
--- /dev/null
+++ b/scripts/digest2.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+
+__program_description ='''
+A script that digests the output of onion-grab.  Meant to be used for sorting
+out the number of onion addresses and how they were discovered via O-L.  It
+is digest "2" because this was added after discovering a redirect bug.   So,
+this output gives a better view of how common HTTP and HTML config really is.
+'''
+
+import sys
+import argparse
+import logging
+
+log = logging.getLogger(__name__)
+
+import base64
+import hashlib
+
def main(args):
    """Entry point: read every input file, tally onion discoveries, log a summary."""
    lines = []
    for path in args.input_file:
        with open(path) as fp:
            lines.extend(fp)

    num_onion, num_http, num_html = parse_input(lines)
    log.info(f'found {num_onion} onions via Onion-Location')
    log.info(f'found {num_http} via HTTP headers')
    log.info(f'found {num_html} via HTML tags')
    # every counted onion has at least one method, so inclusion-exclusion
    # yields the number configured via both HTTP and HTML
    log.info(f'found {num_http + num_html - num_onion} via both HTTP and HTML')
+
def parse_input(lines):
    """Digest onion-grab output lines into discovery-method counts.

    Returns a tuple (num_onion, num_http, num_html): the number of unique
    v3 onion addresses seen, and how many of those were advertised via an
    HTTP header and/or an HTML tag.  Lines that fail to parse are skipped
    and logged at debug level -- deliberately best-effort.
    """
    onion2method = {}
    for line in lines:
        try:
            # Strip the trailing newline only.  The original sliced off the
            # last character unconditionally, which corrupted the final line
            # of a file that lacked a terminating newline.
            line = line.rstrip("\n")
            for addr, is_http in parse_line(line):
                addr = trim_onion(trimPath(trimScheme(addr)))
                methods = onion2method.setdefault(addr, {})
                methods["http" if is_http else "html"] = True
        except Exception as e:
            log.debug(f'"{line}": {e}')

    num_http = sum(1 for d in onion2method.values() if "http" in d)
    num_html = sum(1 for d in onion2method.values() if "html" in d)
    return len(onion2method), num_http, num_html
+
def parse_line(line):
    '''
    Parse one line of onion-grab output.

    Line format is:

    <domain> http=[value] html=[value]

    where at least one of http or html should have a value.  Note: there has
    been no vetting of what <value> is.  Returns a list of (value, is_http)
    tuples, where is_http is a boolean indicating whether the value came
    from an HTTP header (True) or an HTML tag (False).

    Raises Exception if the line does not have the expected shape.
    '''
    parts = line.split(" ")
    if len(parts) != 3:
        raise Exception('invalid line split')

    ret = []
    for field, is_http in ((parts[1], True), (parts[2], False)):
        # maxsplit=1 keeps any "=" inside the URL value intact; the original
        # split on every "=" and silently truncated such values (e.g. query
        # strings like ...?a=b).
        kv = field.split("=", 1)
        if len(kv) < 2:
            raise Exception('invalid http split' if is_http else 'invalid html split')
        if len(kv[1]) > 0:
            ret.append((kv[1], is_http))

    return ret
+
def trimScheme(url):
    '''
    Strip the leading "http://" or "https://" scheme from url; the scheme
    is mandatory, so any other prefix raises.
    '''
    if url.startswith("http://"):
        return url[len("http://"):]
    if url.startswith("https://"):
        return url[len("https://"):]

    raise Exception(f'no http or https scheme')
+
def trimPath(url):
    '''
    Return url with everything from the first "/" onwards removed.
    '''
    host, _, _ = url.partition("/")
    return host
+
def trim_onion(host):
    '''
    Validate host as a v3 onion address and return its final two DNS labels
    ("<56-char-base32>.onion"); any port and subdomains are trimmed.

    Raises Exception on malformed hosts (ValueError may also propagate from
    int() or b32decode() on garbage input; callers catch Exception broadly).
    '''
    parts = host.split(":")
    if len(parts) > 2:
        raise Exception('invalid host name')
    if len(parts) == 2:
        port = int(parts[1])
        if not 1 <= port <= 2**16 - 1:
            raise Exception(f'port number not in [1, {2**16 - 1}]')

    labels = parts[0].split(".")
    if len(labels) < 2:
        raise Exception('too few labels to be an onion address')
    if labels[-1] != "onion":
        raise Exception('the final DNS label must be "onion"')
    if len(labels[-2]) != 56:
        raise Exception('the DNS label before ".onion" must be 56 bytes')

    # 56 base32 characters decode to exactly 35 bytes: pubkey|checksum|version.
    assert_v3(base64.b32decode(labels[-2].upper().encode('UTF-8')))
    return ".".join(labels[-2:])
+
def assert_v3(blob):
    '''
    Assert that blob is a decoded v3 onion address:

        blob = PUBKEY (32 bytes) | CHECKSUM (2 bytes) | VERSION (1 byte)

    where CHECKSUM is the first two bytes of
    SHA3-256(".onion checksum" | PUBKEY | VERSION) and VERSION is 0x03.
    Raises Exception on any mismatch.

    https://gitweb.torproject.org/torspec.git/tree/rend-spec-v3.txt#n2240
    '''
    # Explicit length check: a short blob previously raised a bare
    # IndexError on version[0] instead of a descriptive Exception.
    if len(blob) != 35:
        raise Exception(f'invalid blob length: {len(blob)}')

    pubkey = blob[:32]
    checksum = blob[32:34]
    version = blob[34:35]
    if version[0] != 3:
        raise Exception(f'invalid version: {version[0]}')

    h = hashlib.sha3_256()
    h.update(b'.onion checksum')
    h.update(pubkey)
    h.update(version)
    # compare both checksum bytes at once
    if checksum != h.digest()[:2]:
        raise Exception(f'invalid checksum')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__program_description)
+    parser.add_argument("-v", "--verbosity", type=str, default="info",
+            help="logging verbosity, select from debug, info, warning, error, and critical")
+    parser.add_argument('-i','--input-file', nargs='+', required=True,
+            help='input file with collected data')
+
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.__dict__[args.verbosity.upper()],
+            format='%(filename)s:%(lineno)d %(levelname)s: %(message)s')
+
+    sys.exit(main(args))
-- 
cgit v1.2.3