aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorRasmus Dahlberg <rasmus@rgdd.se>2024-08-22 19:42:20 +0200
committerRasmus Dahlberg <rasmus@rgdd.se>2024-08-22 19:45:31 +0200
commit942e8d24d75eed2473576c22c5894a72ea1f9b30 (patch)
treee825c33fb25931126d2367a97ac7e67e35d207ec /scripts
parentfc6866e229ba8b4278ed1dce79351b40481e6c2a (diff)
Add another digest script
$ ./scripts/digest2.py -i /home/rgdd/Downloads/2023-04-03-ct-sans/au-mel/*.stdout /home/rgdd/Downloads/2023-04-03-ct-sans/us-nyc/*.stdout /home/rgdd/Downloads/2023-04-03-ct-sans/de-fra/*.stdout 2>&1 digest2.py:26 INFO: found 3330 onions via Onion-Location digest2.py:27 INFO: found 3077 via HTTP headers digest2.py:28 INFO: found 281 via HTML tags digest2.py:29 INFO: found 28 via both HTTP and HTML
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/digest2.py161
1 files changed, 161 insertions, 0 deletions
diff --git a/scripts/digest2.py b/scripts/digest2.py
new file mode 100755
index 0000000..d01293b
--- /dev/null
+++ b/scripts/digest2.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+
+# Free-text summary of the script; handed to argparse as the parser
+# description in the __main__ block at the bottom of the file.
+__program_description ='''
+A script that digests the output of onion-grab. Meant to be used for sorting
+out the number of onion addresses and how they were discovered via O-L. It
+is digest "2" because this was added after discovering a redirect bug. So,
+this output gives a better view of how common HTTP and HTML config really is.
+'''
+
+import sys
+import argparse
+import logging
+
+log = logging.getLogger(__name__)
+
+import base64
+import hashlib
+
+def main(args):
+    '''
+    Reads every file listed in args.input_file, digests all collected lines
+    with parse_input(), and logs the resulting counts at INFO level.
+
+    Returns nothing.  An input file that cannot be opened raises OSError.
+    '''
+    input_lines = []
+    for inputFile in args.input_file:
+        # The values in the input have not been vetted (see parse_line), so
+        # be explicit about the encoding and tolerate undecodable bytes
+        # rather than aborting the whole run on the first one.  Whether a
+        # line damaged by the replacement is usable is left to parse_input().
+        # NOTE(review): assumes the input is UTF-8 -- confirm against the
+        # tool that produced the files.
+        with open(inputFile, encoding="utf-8", errors="replace") as fp:
+            input_lines.extend(fp)
+
+    numOnion, numHTTP, numHTML = parse_input(input_lines)
+    log.info(f'found {numOnion} onions via Onion-Location')
+    log.info(f'found {numHTTP} via HTTP headers')
+    log.info(f'found {numHTML} via HTML tags')
+    # Every counted onion was seen via HTTP, HTML, or both, so the overlap
+    # follows from inclusion-exclusion.
+    log.info(f'found {numHTTP + numHTML - numOnion} via both HTTP and HTML')
+
+def parse_input(lines):
+    '''
+    Digests lines in the format documented by parse_line().
+
+    Each value on a line is reduced to its onion address by trimScheme(),
+    trimPath(), and trim_onion().  Returns a tuple
+
+        (numOnion, numHTTP, numHTML)
+
+    with the number of distinct onion addresses, and how many of them were
+    seen at least once via HTTP and via HTML, respectively.
+
+    Parsing is best effort: as soon as anything on a line fails to parse, the
+    rest of that line is skipped and the error is logged at DEBUG level.
+    Values that were already accepted from the same line stay counted.
+    '''
+    onion2method = {}
+    for line in lines:
+        # Strip the line terminator only.  Unconditionally dropping the last
+        # character would corrupt a final line that lacks a trailing newline.
+        line = line.rstrip("\r\n")
+        try:
+            for value, isHTTP in parse_line(line):
+                addr = trim_onion(trimPath(trimScheme(value)))
+                method = "http" if isHTTP else "html"
+                onion2method.setdefault(addr, set()).add(method)
+        except Exception as e:
+            # Deliberately broad: the input is unvetted and one bad line
+            # must not stop the digest.
+            log.debug(f'"{line}": {e}')
+
+    numOnion = len(onion2method)
+    numHTTP = sum(1 for methods in onion2method.values() if "http" in methods)
+    numHTML = sum(1 for methods in onion2method.values() if "html" in methods)
+    return numOnion, numHTTP, numHTML
+
+def parse_line(line):
+    '''
+    Parses a single line of input.  Line format is:
+
+        <domain> http=[value] html=[value]
+
+    where at least one of http or html should have a value.  Note: there has
+    been no vetting of what <value> is.
+
+    Returns a list of (value, isHTTP) tuples, one for each non-empty value
+    in the order they appear on the line.  isHTTP is True for the value in
+    the second field and False for the value in the third field.  The
+    domain is not part of the output.
+
+    Raises an exception if the line does not consist of exactly three
+    space-separated fields, or if the second or third field lacks a "=".
+    '''
+    fields = line.split(" ")
+    if len(fields) != 3:
+        raise Exception('invalid line split')
+
+    ret = []
+    for name, field, isHTTP in (("http", fields[1], True),
+                                ("html", fields[2], False)):
+        # Split on the first "=" only, so that a value which itself contains
+        # "=" (for example a query string) is kept intact.
+        _, sep, value = field.partition("=")
+        if not sep:
+            raise Exception(f'invalid {name} split')
+        if value:
+            ret.append((value, isHTTP))
+
+    return ret
+
+def trimScheme(url):
+    '''
+    Returns url without its leading "http://" or "https://".  A url that
+    starts with neither is rejected with an exception.
+    '''
+    if url.startswith("https://"):
+        return url[len("https://"):]
+    if url.startswith("http://"):
+        return url[len("http://"):]
+
+    raise Exception(f'no http or https scheme')
+
+def trimPath(url):
+    '''
+    Returns the part of url that precedes its first "/", or all of url if
+    it contains no "/".
+    '''
+    head, _, _ = url.partition("/")
+    return head
+
+def trim_onion(host):
+    '''
+    Parses host as a v3 onion address, ports and subdomains are trimmed.
+
+    host is "<name>" or "<name>:<port>".  Returns the last two DNS labels,
+    "<56 characters>.onion", in lower case.
+
+    Raises an exception if there is more than one ":", if the port is not
+    an integer in [1, 65535], if the name does not end in a 56-character
+    label followed by "onion", or if that label does not decode to an
+    address that assert_v3() accepts.
+    '''
+    s = host.split(":")
+    if len(s) > 2:
+        raise Exception(f'invalid host name')
+    if len(s) == 2:
+        port = int(s[1])
+        if port < 1 or port > 2**16 - 1:
+            raise Exception(f'port number not in [1, {2**16 - 1}]')
+
+    # Host names are case-insensitive.  Normalise before validating so that
+    # the same onion written in different case is accepted and is returned
+    # as one and the same string.
+    labels = s[0].lower().split(".")
+    if len(labels) < 2:
+        raise Exception(f'too few labels to be an onion address')
+    if labels[-1] != "onion":
+        raise Exception(f'the final DNS label must be "onion"')
+    if len(labels[-2]) != 56:
+        raise Exception(f'the DNS label before ".onion" must be 56 bytes')
+
+    # b32decode() expects the upper-case base32 alphabet by default.
+    assert_v3(base64.b32decode(labels[-2].upper().encode('UTF-8')))
+    return ".".join(labels[-2:])
+
+def assert_v3(blob):
+    '''
+    Checks that blob is a decoded v3 onion address, laid out as
+
+        pubkey (32 bytes) | checksum (2 bytes) | version (1 byte)
+
+    where version is 3 and checksum equals the first two bytes of
+    SHA3-256(".onion checksum" | pubkey | version).  Returns nothing on
+    success and raises an exception otherwise.
+
+    https://gitweb.torproject.org/torspec.git/tree/rend-spec-v3.txt#n2240
+    '''
+    # Reject blobs of the wrong size up front.  A shorter blob would fail
+    # further down with a bare IndexError, and bytes beyond the first 35
+    # would be silently ignored.
+    if len(blob) != 35:
+        raise Exception(f'invalid length: {len(blob)}')
+
+    pubkey = blob[:32]
+    checksum = blob[32:34]
+    version = blob[34:35]
+    if version[0] != 3:
+        raise Exception(f'invalid version: {version[0]}')
+
+    h = hashlib.sha3_256()
+    h.update(b'.onion checksum')
+    h.update(pubkey)
+    h.update(version)
+    if checksum != h.digest()[:2]:
+        raise Exception(f'invalid checksum')
+
+if __name__ == "__main__":
+    # Command-line entry point: parse options, set up logging, run main().
+    parser = argparse.ArgumentParser(description=__program_description)
+    parser.add_argument("-v", "--verbosity", type=str, default="info",
+            help="logging verbosity, select from debug, info, warning, error, and critical")
+    parser.add_argument('-i','--input-file', nargs='+', required=True,
+            help='input file with collected data')
+
+    args = parser.parse_args()
+
+    # Map the verbosity name to a logging level.  An unknown name, or one
+    # that resolves to something other than a numeric level, is reported as
+    # a usage error instead of surfacing as a traceback.
+    level = getattr(logging, args.verbosity.upper(), None)
+    if not isinstance(level, int):
+        parser.error(f'invalid verbosity: {args.verbosity}')
+    logging.basicConfig(level=level,
+            format='%(filename)s:%(lineno)d %(levelname)s: %(message)s')
+
+    sys.exit(main(args))