#!/usr/bin/env python3
__program_description = '''
A script that digests the output of onion-grab. Meant to be used for sorting
out the number of onion addresses and how they were discovered via O-L. It is
digest "2" because this was added after discovering a redirect bug. So, this
output gives a better view of how common HTTP and HTML config really is.
'''

import sys
import argparse
import logging

log = logging.getLogger(__name__)

import base64
import hashlib


def main(args):
    '''Read all input files, tally how each onion was discovered, and log a summary.'''
    input_lines = []
    for inputFile in args.input_file:
        with open(inputFile) as fp:
            input_lines += [line for line in fp]

    numOnion, numHTTP, numHTML = parse_input(input_lines)
    log.info(f'found {numOnion} onions via Onion-Location')
    log.info(f'found {numHTTP} via HTTP headers')
    log.info(f'found {numHTML} via HTML tags')
    # Inclusion-exclusion: onions that appeared via both methods.
    log.info(f'found {numHTTP + numHTML - numOnion} via both HTTP and HTML')


def parse_input(lines):
    '''
    Parse all collected lines, de-duplicating by v3 onion address.

    Returns a tuple (numOnion, numHTTP, numHTML): the number of unique onion
    addresses, and how many of them were advertised via an HTTP header and/or
    an HTML tag.  Malformed lines are logged at debug level and skipped.
    '''
    onion2method = {}
    for line in lines:
        try:
            # Strip the trailing newline, if any.  Slicing off the last
            # character unconditionally (the old approach) corrupts a final
            # line that has no newline, and leaves '\r' behind on CRLF input.
            line = line.rstrip('\r\n')
            for result in parse_line(line):
                addr, isHTTP = result
                addr = trim_onion(trimPath(trimScheme(addr)))
                onion2method.setdefault(addr, {})
                if isHTTP:
                    onion2method[addr]["http"] = True
                else:
                    onion2method[addr]["html"] = True
        except Exception as e:
            log.debug(f'"{line}": {e}')

    numOnion = len(onion2method)
    numHTTP = 0
    numHTML = 0
    for methods in onion2method.values():
        if "http" in methods:
            numHTTP += 1
        if "html" in methods:
            numHTML += 1
    return numOnion, numHTTP, numHTML


def parse_line(line):
    '''
    Parse one collected line of the format:

        domain http=[value] html=[value]

    where at least one of http or html should have a value.  Note: there has
    been no vetting of what the values are.

    Returns a list of (value, isHTTP) tuples, one per non-empty value, where
    isHTTP is True for the http field and False for the html field.  Raises
    ValueError on malformed lines.
    '''
    ret = []
    s = line.split(" ")
    if len(s) != 3:
        raise ValueError('invalid line split')
    http2onion = s[1]
    html2onion = s[2]

    # maxsplit=1 so values containing '=' (e.g. URL query strings) are kept
    # intact rather than truncated at the next '='.
    s = http2onion.split("=", 1)
    if len(s) < 2:
        raise ValueError('invalid http split')
    if len(s[1]) > 0:
        ret += [(s[1], True)]

    s = html2onion.split("=", 1)
    if len(s) < 2:
        raise ValueError('invalid html split')
    if len(s[1]) > 0:
        ret += [(s[1], False)]
    return ret


def trimScheme(url):
    '''Removes required http:// or https:// scheme from url.'''
    for scheme in ["http://", "https://"]:
        if url.startswith(scheme):
            return url[len(scheme):]
    raise ValueError('no http or https scheme')


def trimPath(url):
    '''Trims the path off from the url.'''
    return url.split("/")[0]


def trim_onion(host):
    '''
    Parses host as a v3 onion address; ports and subdomains are trimmed.

    Returns the bare "<56 base32 chars>.onion" address.  Raises ValueError
    (or a decoding error) on anything that is not a valid v3 onion host.
    '''
    s = host.split(":")
    if len(s) > 2:
        raise ValueError('invalid host name')
    if len(s) == 2:
        # Validate, then discard, an explicit port.
        port = int(s[1])
        if port < 1 or port > 2**16 - 1:
            raise ValueError(f'port number not in [1, {2**16 - 1}]')

    domain = s[0]
    s = domain.split(".")
    if len(s) < 2:
        raise ValueError('too few labels to be an onion address')
    if s[-1] != "onion":
        raise ValueError('the final DNS label must be "onion"')
    if len(s[-2]) != 56:
        raise ValueError('the DNS label before ".onion" must be 56 bytes')
    assert_v3(base64.b32decode(s[-2].upper().encode('UTF-8')))
    return ".".join(s[-2:])


def assert_v3(blob):
    '''
    Verify that blob is a decoded v3 onion address:
    pubkey (32 bytes) || checksum (2 bytes) || version (1 byte), see
    https://gitweb.torproject.org/torspec.git/tree/rend-spec-v3.txt#n2240
    '''
    # Guard the length up front so the indexing below cannot raise an
    # opaque IndexError on a short blob.
    if len(blob) != 35:
        raise ValueError('a v3 onion address must decode to 35 bytes')
    pubkey = blob[:32]
    checksum = blob[32:34]
    version = blob[34:35]
    if version[0] != 3:
        raise ValueError(f'invalid version: {version[0]}')

    h = hashlib.sha3_256()
    h.update(b'.onion checksum')
    h.update(pubkey)
    h.update(version)
    if checksum != h.digest()[:2]:
        raise ValueError('invalid checksum')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__program_description)
    parser.add_argument("-v", "--verbosity", type=str, default="info",
                        help="logging verbosity, select from debug, info, "
                             "warning, error, and critical")
    parser.add_argument('-i', '--input-file', nargs='+', required=True,
                        help='input file with collected data')
    args = parser.parse_args()

    # getattr with validation gives a clear error on a bad verbosity name
    # instead of the KeyError that logging.__dict__[...] would raise.
    level = getattr(logging, args.verbosity.upper(), None)
    if not isinstance(level, int):
        raise SystemExit(f'invalid verbosity: {args.verbosity}')
    logging.basicConfig(level=level,
                        format='%(filename)s:%(lineno)d %(levelname)s: %(message)s')
    sys.exit(main(args))