#!/usr/bin/env python3
__program_description = '''
A script that digests the output of onion-grab. Meant to be used for sorting
out which domains set what Onion-Location values; as well as which domains set
the same onion addresses when pruned to two labels. See also stderr prints.
'''

import sys
import argparse
import logging
import base64
import hashlib

log = logging.getLogger(__name__)


def main(args):
    '''
    Reads all input files, parses them, logs summary statistics, and writes
    two output files: one keyed on domain names, one keyed on two-label
    onion addresses.
    '''
    input_lines = []
    for input_file in args.input_file:
        with open(input_file) as fp:
            input_lines += fp.readlines()

    onion2domains, domain2onions, num_http, num_html = parse_input(input_lines)
    log.info(f'found {num_http} HTTP headers with Onion-Location')
    log.info(f'found {num_html} HTML meta attributes with Onion-Location')
    log.info(f'found {len(domain2onions)} unique domain names that set Onion-Location')
    log.info(f'found {len(onion2domains)} unique two-label onion addresses in the process')

    log.info(f'storing domains with valid Onion-Location configurations in {args.output_domains}')
    with open(args.output_domains, "w") as fp:
        for d, o in domain2onions.items():
            fp.write(f'{d} {" ".join(o)}\n')

    log.info(f'storing two-label onion addresses that domains referenced in {args.output_onions}')
    with open(args.output_onions, "w") as fp:
        for o, d in onion2domains.items():
            fp.write(f'{o} {" ".join(d)}\n')


def parse_input(lines):
    '''
    Parses the collected onion-grab lines.

    Outputs the following two dictionaries and integers:
    - onion2domain: key is a unique two-label onion address; the value is a
      list of domain names that referenced it with Onion-Location.
    - domain2onion: key is the domain name that configured Onion-Location; the
      value is a non-modified list of onion URLs, i.e., including an http or
      https scheme, subdomains (if any), port (if any), and path (if any).
    - number of valid HTTP headers (at most one per non-unique domain)
    - number of valid HTML meta attributes (at most one per non-unique domain)

    Lines that fail to parse or that reference invalid onion addresses are
    skipped and logged at debug level.
    '''
    onion2domain = {}
    domain2onion = {}
    num_http = 0
    num_html = 0
    for line in lines:
        try:
            # Strip the trailing newline only; slicing off the last character
            # unconditionally would corrupt a final line without a newline.
            line = line.rstrip("\n")
            domain, onions, is_http, is_html = parse_line(line)
            for onion in onions:
                addr = trim_onion(trimPath(trimScheme(onion)))
                onion2domain.setdefault(addr, []).append(domain)
                domain2onion.setdefault(domain, []).append(onion)
            # Counters are only bumped when every onion on the line validated.
            if is_http:
                num_http += 1
            if is_html:
                num_html += 1
        except Exception as e:
            log.debug(f'"{line}": {e}')
    return deduplicate_values(onion2domain), deduplicate_values(domain2onion), num_http, num_html


def parse_line(line):
    '''
    Parses a single input line of the format:

        domain http=[value] html=[value]

    where at least one of http or html should have a value.  Note: the values
    themselves are not vetted here; they are validated by the caller.

    Outputs the domain, a list of Onion-Location values, and boolean values
    indicating whether the domain used an HTTP and/or HTML config.  Raises
    Exception on malformed lines.
    '''
    s = line.split(" ")
    if len(s) != 3:
        raise Exception('invalid line split')
    domain, http2onion, html2onion = s

    onions = []
    is_http = False
    s = http2onion.split("=")
    if len(s) < 2:
        raise Exception('invalid http split')
    if len(s[1]) > 1:
        # Re-join in case the URL itself contains '=' characters.
        onions += ["=".join(s[1:])]
        is_http = True

    is_html = False
    s = html2onion.split("=")
    if len(s) < 2:
        raise Exception('invalid html split')
    if len(s[1]) > 1:
        onions += ["=".join(s[1:])]
        is_html = True

    if is_http and is_html:
        log.debug(f'{domain} sets both http header and html attribute')
    return domain, onions, is_http, is_html


def trimScheme(url):
    '''
    Removes the required http:// or https:// scheme from url.  Raises
    Exception if neither scheme is present.
    '''
    for scheme in ("http://", "https://"):
        if url.startswith(scheme):
            return url[len(scheme):]
    raise Exception('no http or https scheme')


def trimPath(url):
    '''
    Trims the path off from the url, keeping host and optional port.
    '''
    return url.split("/")[0]


def trim_onion(host):
    '''
    Parses host as a v3 onion address; ports and subdomains are trimmed.
    Returns the two-label address ("<56-char label>.onion").  Raises
    Exception (or ValueError for a non-numeric port) if host is not a
    valid v3 onion address.
    '''
    s = host.split(":")
    if len(s) > 2:
        raise Exception('invalid host name')
    if len(s) == 2:
        port = int(s[1])  # may raise ValueError; caught by the caller
        if port < 1 or port > 2**16 - 1:
            raise Exception(f'port number not in [1, {2**16 - 1}]')
    labels = s[0].split(".")
    if len(labels) < 2:
        raise Exception('too few labels to be an onion address')
    if labels[-1] != "onion":
        raise Exception('the final DNS label must be "onion"')
    if len(labels[-2]) != 56:
        raise Exception('the DNS label before ".onion" must be 56 bytes')
    assert_v3(base64.b32decode(labels[-2].upper().encode('UTF-8')))
    return ".".join(labels[-2:])


def assert_v3(blob):
    '''
    Asserts that blob is a decoded v3 onion address:

        blob = PUBKEY(32 bytes) || CHECKSUM(2 bytes) || VERSION(1 byte)
        CHECKSUM = sha3_256(".onion checksum" || PUBKEY || VERSION)[:2]

    https://gitweb.torproject.org/torspec.git/tree/rend-spec-v3.txt#n2240
    '''
    # Explicit length check so short blobs raise Exception, not IndexError.
    if len(blob) != 35:
        raise Exception(f'invalid blob length: {len(blob)}')
    pubkey = blob[:32]
    checksum = blob[32:34]
    version = blob[34:35]
    if version[0] != 3:
        raise Exception(f'invalid version: {version[0]}')
    h = hashlib.sha3_256()
    h.update(b'.onion checksum')
    h.update(pubkey)
    h.update(version)
    if checksum != h.digest()[:2]:
        raise Exception('invalid checksum')


def deduplicate_values(key2values):
    '''
    Sorts and deduplicates every value list in key2values, in place.
    '''
    for key in key2values:
        key2values[key] = sorted(set(key2values[key]))
    return key2values


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__program_description)
    parser.add_argument("-v", "--verbosity", type=str, default="info",
                        help="logging verbosity, select from debug, info, warning, error, and critical")
    parser.add_argument("-o", "--output-onions", required=False, type=str, default="onions.txt",
                        help="path to output file keyed on two-label onion addresses")
    parser.add_argument("-d", "--output-domains", required=False, type=str, default="domains.txt",
                        help="path to output file keyed on domains with Onion-Location")
    parser.add_argument('-i', '--input-file', nargs='+', required=True,
                        help='input file with collected data')
    args = parser.parse_args()

    logging.basicConfig(level=logging.__dict__[args.verbosity.upper()],
                        format='%(filename)s:%(lineno)d %(levelname)s: %(message)s')
    sys.exit(main(args))