From 385cc92bc91e1a6c3724085c060e76bf40c13ed3 Mon Sep 17 00:00:00 2001 From: Rasmus Dahlberg Date: Tue, 15 Oct 2024 16:08:16 +0200 Subject: Import PhD thesis --- summary/src/abbr.tex | 19 + summary/src/abstract.tex | 36 + summary/src/acknowledgements.tex | 40 + summary/src/cat/.gitignore | 9 + summary/src/cat/img/aws.pdf | Bin 0 -> 150359 bytes summary/src/cat/img/bt10.pdf | Bin 0 -> 154201 bytes summary/src/cat/img/bt100.pdf | Bin 0 -> 157181 bytes summary/src/cat/img/bt1000.pdf | Bin 0 -> 148806 bytes summary/src/cat/img/df_nodef.pdf | Bin 0 -> 118767 bytes summary/src/cat/img/df_wt.pdf | Bin 0 -> 105303 bytes summary/src/cat/img/df_wtfpad.pdf | Bin 0 -> 126269 bytes summary/src/cat/img/dns__classifier-idea.pdf | Bin 0 -> 86270 bytes summary/src/cat/img/dns__timing-dist.pdf | Bin 0 -> 98297 bytes summary/src/cat/img/dynaflow_config1.pdf | Bin 0 -> 115968 bytes summary/src/cat/img/dynaflow_config2.pdf | Bin 0 -> 127080 bytes summary/src/cat/img/dynaflow_nodef.pdf | Bin 0 -> 123795 bytes summary/src/cat/img/factor-fnp.pdf | Bin 0 -> 123720 bytes summary/src/cat/img/factor-recall.pdf | Bin 0 -> 145211 bytes summary/src/cat/img/probfp.pdf | Bin 0 -> 132759 bytes summary/src/cat/img/setting-oracle.pdf | Bin 0 -> 1316415 bytes summary/src/cat/img/setting.pdf | Bin 0 -> 1021741 bytes summary/src/cat/img/timeuntilvisited.pdf | Bin 0 -> 84191 bytes summary/src/cat/img/wang_csbuflo.pdf | Bin 0 -> 111918 bytes summary/src/cat/img/wang_nodef.pdf | Bin 0 -> 118619 bytes summary/src/cat/img/wang_tamaraw.pdf | Bin 0 -> 116397 bytes summary/src/cat/main.tex | 70 + summary/src/cat/src/abstract.tex | 25 + summary/src/cat/src/ack.tex | 9 + summary/src/cat/src/background.tex | 211 ++ summary/src/cat/src/bayes.tex | 96 + summary/src/cat/src/conclusions.tex | 25 + summary/src/cat/src/discussion.tex | 153 ++ summary/src/cat/src/intro.tex | 122 ++ summary/src/cat/src/lessons.tex | 47 + summary/src/cat/src/main.tex | 121 ++ summary/src/cat/src/oracles.tex | 126 ++ 
summary/src/cat/src/othersources.tex | 112 + summary/src/cat/src/ref-min.bib | 837 ++++++++ summary/src/cat/src/related.tex | 64 + summary/src/cat/src/sim.tex | 131 ++ summary/src/cat/src/sources.tex | 204 ++ summary/src/cat/src/wf.tex | 181 ++ summary/src/ctga/.gitignore | 9 + summary/src/ctga/img/design.pdf | Bin 0 -> 33094 bytes summary/src/ctga/img/parser.tex | 66 + summary/src/ctga/img/perf-netfpga.pdf | Bin 0 -> 15936 bytes summary/src/ctga/img/perf-xdp.pdf | Bin 0 -> 15950 bytes summary/src/ctga/img/pl.pdf | Bin 0 -> 13695 bytes summary/src/ctga/img/ps.pdf | Bin 0 -> 15881 bytes summary/src/ctga/img/related.tex | 37 + summary/src/ctga/img/wcov-goo.pdf | Bin 0 -> 19025 bytes summary/src/ctga/img/wcov-nor.pdf | Bin 0 -> 17408 bytes summary/src/ctga/main.tex | 70 + summary/src/ctga/src/abstract.tex | 16 + summary/src/ctga/src/acknowledgments.tex | 5 + summary/src/ctga/src/background.tex | 90 + summary/src/ctga/src/conclusion.tex | 23 + summary/src/ctga/src/design.tex | 129 ++ summary/src/ctga/src/discussion.tex | 126 ++ summary/src/ctga/src/implementation.tex | 82 + summary/src/ctga/src/introduction.tex | 93 + summary/src/ctga/src/measurements.tex | 85 + summary/src/ctga/src/ref.bib | 573 +++++ summary/src/ctga/src/related.tex | 67 + summary/src/ctor/.gitignore | 9 + summary/src/ctor/img/design-full.pdf | Bin 0 -> 62338 bytes summary/src/ctor/img/design-incremental.pdf | Bin 0 -> 56192 bytes summary/src/ctor/main.tex | 72 + summary/src/ctor/src/abstract.tex | 30 + summary/src/ctor/src/acknowledgements.tex | 7 + summary/src/ctor/src/adversary.tex | 76 + summary/src/ctor/src/analysis.tex | 173 ++ summary/src/ctor/src/appendix.tex | 117 ++ summary/src/ctor/src/background.tex | 150 ++ summary/src/ctor/src/conclusion.tex | 49 + summary/src/ctor/src/cross-logging.tex | 101 + summary/src/ctor/src/design.tex | 377 ++++ summary/src/ctor/src/introduction.tex | 183 ++ summary/src/ctor/src/performance.tex | 142 ++ summary/src/ctor/src/privacy.tex | 48 + 
summary/src/ctor/src/ref.bib | 536 +++++ summary/src/ctor/src/related.tex | 80 + summary/src/introduction/img/contribs.pdf | Bin 0 -> 91306 bytes summary/src/introduction/img/contribs.svg | 2213 ++++++++++++++++++++ summary/src/introduction/img/ct.pdf | Bin 0 -> 38256 bytes summary/src/introduction/img/ct.svg | 1346 ++++++++++++ summary/src/introduction/main.tex | 826 ++++++++ summary/src/introduction/refs.bib | 954 +++++++++ summary/src/lwm/.gitignore | 9 + summary/src/lwm/img/mt.tex | 28 + summary/src/lwm/img/overview.tex | 75 + summary/src/lwm/img/proofcom.pdf | Bin 0 -> 12595 bytes summary/src/lwm/img/proofgen.pdf | Bin 0 -> 14456 bytes summary/src/lwm/img/proofvf.pdf | Bin 0 -> 14022 bytes summary/src/lwm/img/snapshot.pdf | Bin 0 -> 11767 bytes summary/src/lwm/img/wildcard.tex | 22 + summary/src/lwm/main.tex | 54 + summary/src/lwm/src/abstract.tex | 21 + summary/src/lwm/src/acknowledgments.tex | 3 + summary/src/lwm/src/background.tex | 119 ++ summary/src/lwm/src/conclusion.tex | 15 + summary/src/lwm/src/evaluation.tex | 186 ++ summary/src/lwm/src/introduction.tex | 76 + summary/src/lwm/src/lwm.tex | 148 ++ summary/src/lwm/src/references.bib | 255 +++ summary/src/other.tex | 36 + summary/src/sammanfattning.tex | 41 + summary/src/sauteed/.gitignore | 9 + summary/src/sauteed/img/onion-location.pdf | Bin 0 -> 17583 bytes summary/src/sauteed/img/onion-search.pdf | Bin 0 -> 18261 bytes summary/src/sauteed/main.tex | 64 + summary/src/sauteed/src/abstract.tex | 22 + summary/src/sauteed/src/acks.tex | 9 + summary/src/sauteed/src/appendix.tex | 79 + summary/src/sauteed/src/conc.tex | 16 + summary/src/sauteed/src/intro.tex | 45 + summary/src/sauteed/src/preliminaries.tex | 25 + summary/src/sauteed/src/refs.bib | 325 +++ summary/src/sauteed/src/related.tex | 62 + summary/src/sauteed/src/sauteed.tex | 260 +++ summary/src/tlwo/.gitignore | 9 + summary/src/tlwo/img/.gitkeep | 1 + summary/src/tlwo/img/attack.pdf | Bin 0 -> 19745 bytes summary/src/tlwo/img/cached.pdf | Bin 0 
-> 14759 bytes .../src/tlwo/img/plot_cache_entries-permissive.pdf | Bin 0 -> 13239 bytes summary/src/tlwo/img/plot_cache_entries-web.pdf | Bin 0 -> 12990 bytes .../src/tlwo/img/plot_cache_hits-permissive.pdf | Bin 0 -> 28384 bytes summary/src/tlwo/img/plot_cache_hits-web.pdf | Bin 0 -> 28479 bytes summary/src/tlwo/img/plot_lookups-permissive.pdf | Bin 0 -> 15707 bytes summary/src/tlwo/img/plot_lookups-web.pdf | Bin 0 -> 15988 bytes .../tlwo/img/plot_popularity_match-permissive.pdf | Bin 0 -> 12204 bytes summary/src/tlwo/img/plot_popularity_match-web.pdf | Bin 0 -> 12201 bytes .../tlwo/img/plot_preload_entries-permissive.pdf | Bin 0 -> 107993 bytes summary/src/tlwo/img/plot_preload_entries-web.pdf | Bin 0 -> 107839 bytes .../src/tlwo/img/plot_preload_hits-permissive.pdf | Bin 0 -> 40862 bytes summary/src/tlwo/img/plot_preload_hits-web.pdf | Bin 0 -> 40856 bytes .../src/tlwo/img/plot_preload_lists-permissive.pdf | Bin 0 -> 34103 bytes summary/src/tlwo/img/plot_preload_lists-web.pdf | Bin 0 -> 35606 bytes summary/src/tlwo/img/preload.pdf | Bin 0 -> 110579 bytes summary/src/tlwo/img/preload.svg | 1009 +++++++++ summary/src/tlwo/img/repeat-attack.pdf | Bin 0 -> 15385 bytes summary/src/tlwo/img/resolve.pdf | Bin 0 -> 16334 bytes summary/src/tlwo/img/setting.pdf | Bin 0 -> 214769 bytes summary/src/tlwo/img/uncached.pdf | Bin 0 -> 15740 bytes summary/src/tlwo/main.tex | 69 + summary/src/tlwo/src/abstract.tex | 25 + summary/src/tlwo/src/acknowledgements.tex | 20 + summary/src/tlwo/src/attack.tex | 247 +++ summary/src/tlwo/src/availability.tex | 19 + summary/src/tlwo/src/background.tex | 73 + summary/src/tlwo/src/conclusion.tex | 52 + summary/src/tlwo/src/introduction.tex | 151 ++ summary/src/tlwo/src/long.tex | 473 +++++ summary/src/tlwo/src/ref.bib | 352 ++++ summary/src/tlwo/src/related.tex | 174 ++ summary/src/tlwo/src/short.tex | 63 + summary/src/tlwo/src/tor-cache.tex | 167 ++ 157 files changed, 16506 insertions(+) create mode 100644 summary/src/abbr.tex create mode 
100644 summary/src/abstract.tex create mode 100644 summary/src/acknowledgements.tex create mode 100644 summary/src/cat/.gitignore create mode 100644 summary/src/cat/img/aws.pdf create mode 100644 summary/src/cat/img/bt10.pdf create mode 100644 summary/src/cat/img/bt100.pdf create mode 100644 summary/src/cat/img/bt1000.pdf create mode 100644 summary/src/cat/img/df_nodef.pdf create mode 100644 summary/src/cat/img/df_wt.pdf create mode 100644 summary/src/cat/img/df_wtfpad.pdf create mode 100644 summary/src/cat/img/dns__classifier-idea.pdf create mode 100644 summary/src/cat/img/dns__timing-dist.pdf create mode 100644 summary/src/cat/img/dynaflow_config1.pdf create mode 100644 summary/src/cat/img/dynaflow_config2.pdf create mode 100644 summary/src/cat/img/dynaflow_nodef.pdf create mode 100644 summary/src/cat/img/factor-fnp.pdf create mode 100644 summary/src/cat/img/factor-recall.pdf create mode 100644 summary/src/cat/img/probfp.pdf create mode 100644 summary/src/cat/img/setting-oracle.pdf create mode 100644 summary/src/cat/img/setting.pdf create mode 100644 summary/src/cat/img/timeuntilvisited.pdf create mode 100644 summary/src/cat/img/wang_csbuflo.pdf create mode 100644 summary/src/cat/img/wang_nodef.pdf create mode 100644 summary/src/cat/img/wang_tamaraw.pdf create mode 100644 summary/src/cat/main.tex create mode 100644 summary/src/cat/src/abstract.tex create mode 100644 summary/src/cat/src/ack.tex create mode 100644 summary/src/cat/src/background.tex create mode 100644 summary/src/cat/src/bayes.tex create mode 100644 summary/src/cat/src/conclusions.tex create mode 100644 summary/src/cat/src/discussion.tex create mode 100644 summary/src/cat/src/intro.tex create mode 100644 summary/src/cat/src/lessons.tex create mode 100644 summary/src/cat/src/main.tex create mode 100644 summary/src/cat/src/oracles.tex create mode 100644 summary/src/cat/src/othersources.tex create mode 100644 summary/src/cat/src/ref-min.bib create mode 100644 summary/src/cat/src/related.tex create mode 
100644 summary/src/cat/src/sim.tex create mode 100644 summary/src/cat/src/sources.tex create mode 100644 summary/src/cat/src/wf.tex create mode 100644 summary/src/ctga/.gitignore create mode 100644 summary/src/ctga/img/design.pdf create mode 100644 summary/src/ctga/img/parser.tex create mode 100644 summary/src/ctga/img/perf-netfpga.pdf create mode 100644 summary/src/ctga/img/perf-xdp.pdf create mode 100644 summary/src/ctga/img/pl.pdf create mode 100644 summary/src/ctga/img/ps.pdf create mode 100644 summary/src/ctga/img/related.tex create mode 100644 summary/src/ctga/img/wcov-goo.pdf create mode 100644 summary/src/ctga/img/wcov-nor.pdf create mode 100644 summary/src/ctga/main.tex create mode 100644 summary/src/ctga/src/abstract.tex create mode 100644 summary/src/ctga/src/acknowledgments.tex create mode 100644 summary/src/ctga/src/background.tex create mode 100644 summary/src/ctga/src/conclusion.tex create mode 100644 summary/src/ctga/src/design.tex create mode 100644 summary/src/ctga/src/discussion.tex create mode 100644 summary/src/ctga/src/implementation.tex create mode 100644 summary/src/ctga/src/introduction.tex create mode 100644 summary/src/ctga/src/measurements.tex create mode 100644 summary/src/ctga/src/ref.bib create mode 100644 summary/src/ctga/src/related.tex create mode 100644 summary/src/ctor/.gitignore create mode 100644 summary/src/ctor/img/design-full.pdf create mode 100644 summary/src/ctor/img/design-incremental.pdf create mode 100644 summary/src/ctor/main.tex create mode 100644 summary/src/ctor/src/abstract.tex create mode 100644 summary/src/ctor/src/acknowledgements.tex create mode 100644 summary/src/ctor/src/adversary.tex create mode 100644 summary/src/ctor/src/analysis.tex create mode 100644 summary/src/ctor/src/appendix.tex create mode 100644 summary/src/ctor/src/background.tex create mode 100644 summary/src/ctor/src/conclusion.tex create mode 100644 summary/src/ctor/src/cross-logging.tex create mode 100644 summary/src/ctor/src/design.tex 
create mode 100644 summary/src/ctor/src/introduction.tex create mode 100644 summary/src/ctor/src/performance.tex create mode 100644 summary/src/ctor/src/privacy.tex create mode 100644 summary/src/ctor/src/ref.bib create mode 100644 summary/src/ctor/src/related.tex create mode 100644 summary/src/introduction/img/contribs.pdf create mode 100644 summary/src/introduction/img/contribs.svg create mode 100644 summary/src/introduction/img/ct.pdf create mode 100644 summary/src/introduction/img/ct.svg create mode 100644 summary/src/introduction/main.tex create mode 100644 summary/src/introduction/refs.bib create mode 100644 summary/src/lwm/.gitignore create mode 100644 summary/src/lwm/img/mt.tex create mode 100644 summary/src/lwm/img/overview.tex create mode 100644 summary/src/lwm/img/proofcom.pdf create mode 100644 summary/src/lwm/img/proofgen.pdf create mode 100644 summary/src/lwm/img/proofvf.pdf create mode 100644 summary/src/lwm/img/snapshot.pdf create mode 100644 summary/src/lwm/img/wildcard.tex create mode 100644 summary/src/lwm/main.tex create mode 100644 summary/src/lwm/src/abstract.tex create mode 100644 summary/src/lwm/src/acknowledgments.tex create mode 100644 summary/src/lwm/src/background.tex create mode 100644 summary/src/lwm/src/conclusion.tex create mode 100644 summary/src/lwm/src/evaluation.tex create mode 100644 summary/src/lwm/src/introduction.tex create mode 100644 summary/src/lwm/src/lwm.tex create mode 100644 summary/src/lwm/src/references.bib create mode 100644 summary/src/other.tex create mode 100644 summary/src/sammanfattning.tex create mode 100644 summary/src/sauteed/.gitignore create mode 100644 summary/src/sauteed/img/onion-location.pdf create mode 100644 summary/src/sauteed/img/onion-search.pdf create mode 100644 summary/src/sauteed/main.tex create mode 100644 summary/src/sauteed/src/abstract.tex create mode 100644 summary/src/sauteed/src/acks.tex create mode 100644 summary/src/sauteed/src/appendix.tex create mode 100644 
summary/src/sauteed/src/conc.tex create mode 100644 summary/src/sauteed/src/intro.tex create mode 100644 summary/src/sauteed/src/preliminaries.tex create mode 100644 summary/src/sauteed/src/refs.bib create mode 100644 summary/src/sauteed/src/related.tex create mode 100644 summary/src/sauteed/src/sauteed.tex create mode 100644 summary/src/tlwo/.gitignore create mode 100644 summary/src/tlwo/img/.gitkeep create mode 100644 summary/src/tlwo/img/attack.pdf create mode 100644 summary/src/tlwo/img/cached.pdf create mode 100644 summary/src/tlwo/img/plot_cache_entries-permissive.pdf create mode 100644 summary/src/tlwo/img/plot_cache_entries-web.pdf create mode 100644 summary/src/tlwo/img/plot_cache_hits-permissive.pdf create mode 100644 summary/src/tlwo/img/plot_cache_hits-web.pdf create mode 100644 summary/src/tlwo/img/plot_lookups-permissive.pdf create mode 100644 summary/src/tlwo/img/plot_lookups-web.pdf create mode 100644 summary/src/tlwo/img/plot_popularity_match-permissive.pdf create mode 100644 summary/src/tlwo/img/plot_popularity_match-web.pdf create mode 100644 summary/src/tlwo/img/plot_preload_entries-permissive.pdf create mode 100644 summary/src/tlwo/img/plot_preload_entries-web.pdf create mode 100644 summary/src/tlwo/img/plot_preload_hits-permissive.pdf create mode 100644 summary/src/tlwo/img/plot_preload_hits-web.pdf create mode 100644 summary/src/tlwo/img/plot_preload_lists-permissive.pdf create mode 100644 summary/src/tlwo/img/plot_preload_lists-web.pdf create mode 100644 summary/src/tlwo/img/preload.pdf create mode 100644 summary/src/tlwo/img/preload.svg create mode 100644 summary/src/tlwo/img/repeat-attack.pdf create mode 100644 summary/src/tlwo/img/resolve.pdf create mode 100644 summary/src/tlwo/img/setting.pdf create mode 100644 summary/src/tlwo/img/uncached.pdf create mode 100644 summary/src/tlwo/main.tex create mode 100644 summary/src/tlwo/src/abstract.tex create mode 100644 summary/src/tlwo/src/acknowledgements.tex create mode 100644 
summary/src/tlwo/src/attack.tex create mode 100644 summary/src/tlwo/src/availability.tex create mode 100644 summary/src/tlwo/src/background.tex create mode 100644 summary/src/tlwo/src/conclusion.tex create mode 100644 summary/src/tlwo/src/introduction.tex create mode 100644 summary/src/tlwo/src/long.tex create mode 100644 summary/src/tlwo/src/ref.bib create mode 100644 summary/src/tlwo/src/related.tex create mode 100644 summary/src/tlwo/src/short.tex create mode 100644 summary/src/tlwo/src/tor-cache.tex (limited to 'summary/src') diff --git a/summary/src/abbr.tex b/summary/src/abbr.tex new file mode 100644 index 0000000..ea8bdcd --- /dev/null +++ b/summary/src/abbr.tex @@ -0,0 +1,19 @@ +\section*{List of Acronyms} +The introductory summary refrains from using acronyms to make it easily +digested. However, a few acronyms are used without definition because their +full versions are likely less familiar to most readers. These acronyms are: + +\begin{description} + \item[DNS] Domain Name System + \item[HTML] Hyper Text Markup Language + \item[HTTP] Hyper Text Transfer Protocol + \item[HTTPS] Hyper Text Transfer Protocol Secure + \item[IP] Internet Protocol + \item[P4] Programming Protocol-independent Packet Processors + \item[RFC] Request For Comments + \item[RIPE] R\'{e}seaux IP Europ\'{e}ens + \item[RSA] Rivest Shamir Adleman + \item[TCP] Transmission Control Protocol + \item[TLS] Transport Layer Security + \item[XDP] eXpress Data Path +\end{description} diff --git a/summary/src/abstract.tex b/summary/src/abstract.tex new file mode 100644 index 0000000..0a43c0f --- /dev/null +++ b/summary/src/abstract.tex @@ -0,0 +1,36 @@ +Certificate Transparency is an ecosystem of logs, monitors, and auditors that +hold certificate authorities accountable while issuing certificates. 
We show +how the amount of trust that TLS clients and domain owners need to place in +Certificate Transparency can be reduced, both in the context of existing gradual +deployments and the largely unexplored area of Tor. Our contributions include + improved third-party monitoring, + a gossip protocol plugging into Certificate Transparency over DNS, + an incrementally deployable gossip-audit model tailored for Tor Browser, and + using certificates with onion addresses. +The methods used range from proof sketches to Internet measurements and +prototype evaluations. An essential part of our evaluation in Tor is to assess +how the protocols used during website visits---such as requesting an inclusion +proof from a Certificate Transparency log---affect unlinkability between senders +and receivers. We find that most false positives in website fingerprinting +attacks can be eliminated for all but the most frequently visited sites. This +is because the destination anonymity set can be reduced due to how Internet +protocols work: communication is observable and often involves third-party +interactions. Some of the used protocols can further be subject to side-channel +analysis. For example, we show that remote (timeless) timing attacks against +Tor's DNS cache reliably reveal the timing of past exit traffic. The severity +and practicality of our extension to website fingerprinting pose threats to the +anonymity provided by Tor. We conclude that access to a so-called website +oracle should be an assumed attacker capability when evaluating website +fingerprinting~defenses. 
+ +\keywords + Auditing, + Certificate Transparency, + DNS, + Gossip, + Side-Channels, + Timing Attacks, + Tor, + Tor Browser, + Website Fingerprinting, + Website Oracles diff --git a/summary/src/acknowledgements.tex b/summary/src/acknowledgements.tex new file mode 100644 index 0000000..c585150 --- /dev/null +++ b/summary/src/acknowledgements.tex @@ -0,0 +1,40 @@ +I am fortunate to be surrounded by people that helped me reach this milestone in +one piece. +First, + I want to acknowledge that my brother, Victor, provided me with unlimited + tutoring before my PhD studies. I would not be on my current path without + him. +Second, + Sarah and Andreas, thank you for challenging and supporting me. You have been + a determinantal part of my personal growth and well-being. +Finally, + Mom, thank you for being around whenever I need it. + +Several doors were opened for me at Karlstad University. Stefan Alfredsson and +Stefan Lindskog kick-started my undergraduate studies with programming feedback, +increased-pace study plans, and the opportunity to be involved in the +department. Tobias Pulls introduced me to computer security research. We have +worked closely ever since, and \emph{I strongly recommend him as an advisor and +collaborator}. Simone Fischer-H\"{u}bner, Stefan Lindskog, Leonardo Martucci, +and Tobias Pulls all provided sincere advice during my PhD studies. Many other +individuals selflessly helped me forward. Some of them include + Ala Sarah Alaqra, + Linus Nordberg, + Jenni Reuben, + Fredrik Str\"{o}mberg, and + Paul Syverson. + +I am further grateful for my +collaborators: + Matthew Finkel, + Toke H{\o}iland-J{\o}rgensen, + Andreas Kassler, + Linus Nordberg, + Tobias Pulls, + Tom Ritter, + Paul Syverson, and + Jonathan Vestin. +They are all driven individuals that I learned a lot from. +I also appreciate the generous funding received from + the Swedish Knowledge Foundation and + the Swedish Foundation for Strategic Research. 
diff --git a/summary/src/cat/.gitignore b/summary/src/cat/.gitignore new file mode 100644 index 0000000..8bb88c8 --- /dev/null +++ b/summary/src/cat/.gitignore @@ -0,0 +1,9 @@ +main.pdf +*.blg +*.bbl +*.fls +*.fdb_latexmk +*.log +*.out +*.aux +*.swp diff --git a/summary/src/cat/img/aws.pdf b/summary/src/cat/img/aws.pdf new file mode 100644 index 0000000..0c3161e Binary files /dev/null and b/summary/src/cat/img/aws.pdf differ diff --git a/summary/src/cat/img/bt10.pdf b/summary/src/cat/img/bt10.pdf new file mode 100644 index 0000000..23fbd5c Binary files /dev/null and b/summary/src/cat/img/bt10.pdf differ diff --git a/summary/src/cat/img/bt100.pdf b/summary/src/cat/img/bt100.pdf new file mode 100644 index 0000000..b37c523 Binary files /dev/null and b/summary/src/cat/img/bt100.pdf differ diff --git a/summary/src/cat/img/bt1000.pdf b/summary/src/cat/img/bt1000.pdf new file mode 100644 index 0000000..65e51f8 Binary files /dev/null and b/summary/src/cat/img/bt1000.pdf differ diff --git a/summary/src/cat/img/df_nodef.pdf b/summary/src/cat/img/df_nodef.pdf new file mode 100644 index 0000000..f72a9d0 Binary files /dev/null and b/summary/src/cat/img/df_nodef.pdf differ diff --git a/summary/src/cat/img/df_wt.pdf b/summary/src/cat/img/df_wt.pdf new file mode 100644 index 0000000..14a18b0 Binary files /dev/null and b/summary/src/cat/img/df_wt.pdf differ diff --git a/summary/src/cat/img/df_wtfpad.pdf b/summary/src/cat/img/df_wtfpad.pdf new file mode 100644 index 0000000..e425ae8 Binary files /dev/null and b/summary/src/cat/img/df_wtfpad.pdf differ diff --git a/summary/src/cat/img/dns__classifier-idea.pdf b/summary/src/cat/img/dns__classifier-idea.pdf new file mode 100644 index 0000000..bf68fff Binary files /dev/null and b/summary/src/cat/img/dns__classifier-idea.pdf differ diff --git a/summary/src/cat/img/dns__timing-dist.pdf b/summary/src/cat/img/dns__timing-dist.pdf new file mode 100644 index 0000000..eca8a0a Binary files /dev/null and 
b/summary/src/cat/img/dns__timing-dist.pdf differ diff --git a/summary/src/cat/img/dynaflow_config1.pdf b/summary/src/cat/img/dynaflow_config1.pdf new file mode 100644 index 0000000..de82dc4 Binary files /dev/null and b/summary/src/cat/img/dynaflow_config1.pdf differ diff --git a/summary/src/cat/img/dynaflow_config2.pdf b/summary/src/cat/img/dynaflow_config2.pdf new file mode 100644 index 0000000..e995173 Binary files /dev/null and b/summary/src/cat/img/dynaflow_config2.pdf differ diff --git a/summary/src/cat/img/dynaflow_nodef.pdf b/summary/src/cat/img/dynaflow_nodef.pdf new file mode 100644 index 0000000..c481a3d Binary files /dev/null and b/summary/src/cat/img/dynaflow_nodef.pdf differ diff --git a/summary/src/cat/img/factor-fnp.pdf b/summary/src/cat/img/factor-fnp.pdf new file mode 100644 index 0000000..933d1ce Binary files /dev/null and b/summary/src/cat/img/factor-fnp.pdf differ diff --git a/summary/src/cat/img/factor-recall.pdf b/summary/src/cat/img/factor-recall.pdf new file mode 100644 index 0000000..2e017db Binary files /dev/null and b/summary/src/cat/img/factor-recall.pdf differ diff --git a/summary/src/cat/img/probfp.pdf b/summary/src/cat/img/probfp.pdf new file mode 100644 index 0000000..81e8de8 Binary files /dev/null and b/summary/src/cat/img/probfp.pdf differ diff --git a/summary/src/cat/img/setting-oracle.pdf b/summary/src/cat/img/setting-oracle.pdf new file mode 100644 index 0000000..4620d67 Binary files /dev/null and b/summary/src/cat/img/setting-oracle.pdf differ diff --git a/summary/src/cat/img/setting.pdf b/summary/src/cat/img/setting.pdf new file mode 100644 index 0000000..1004bf1 Binary files /dev/null and b/summary/src/cat/img/setting.pdf differ diff --git a/summary/src/cat/img/timeuntilvisited.pdf b/summary/src/cat/img/timeuntilvisited.pdf new file mode 100644 index 0000000..188746b Binary files /dev/null and b/summary/src/cat/img/timeuntilvisited.pdf differ diff --git a/summary/src/cat/img/wang_csbuflo.pdf 
b/summary/src/cat/img/wang_csbuflo.pdf new file mode 100644 index 0000000..0ec042b Binary files /dev/null and b/summary/src/cat/img/wang_csbuflo.pdf differ diff --git a/summary/src/cat/img/wang_nodef.pdf b/summary/src/cat/img/wang_nodef.pdf new file mode 100644 index 0000000..7dd0023 Binary files /dev/null and b/summary/src/cat/img/wang_nodef.pdf differ diff --git a/summary/src/cat/img/wang_tamaraw.pdf b/summary/src/cat/img/wang_tamaraw.pdf new file mode 100644 index 0000000..7f902ff Binary files /dev/null and b/summary/src/cat/img/wang_tamaraw.pdf differ diff --git a/summary/src/cat/main.tex b/summary/src/cat/main.tex new file mode 100644 index 0000000..5dd9d84 --- /dev/null +++ b/summary/src/cat/main.tex @@ -0,0 +1,70 @@ +\begin{kaupaper}[ + author={% + Tobias Pulls and \textbf{Rasmus Dahlberg} + }, + title={% + Website Fingerprinting with Website Oracles + }, + reference={% + PETS (2020) + }, + summary={% + One of the properties Tor aims to provide against local network attackers + is unlinkability between end-users (sender anonymity set) and their + destinations on the Internet (receiver anonymity set). A website + fingerprinting attack aims to break anonymity in this model by inferring + which website an identifiable end-user is visiting based only on the + traffic entering the Tor network. We extend the attacker model for + website fingerprinting attacks by introducing the notion of \emph{website + oracles}. A website oracle answers the following question: was website $w$ + visited during time frame $t$? In other words, the attacker can query the + receiver anonymity set for websites that were (not) visited. Our + simulations show that augmenting past website fingerprinting attacks to + include website oracles significantly reduces false positives for all but + the most popular websites, e.g., to the order of $10^{-6}$ for + classifications around Alexa top-10k and much less for the long tail of + sites. 
Further, some earlier website fingerprinting defenses are largely + ineffective in the (stronger) attacker model that includes website + oracles. We discuss a dozen real-world website oracles ranging from + centralized access logs to widely accessible real-time bidding platforms + and DNS caches, arguing that they are inherent parts of the protocols used + to perform website visits. Therefore, access to a website oracle should + be an assumed attacker capability when evaluating which website + fingerprinting defenses are effective. + }, + participation={\vspace{-.25cm} + Tobias is the main author and conducted most of the work. I mainly + contributed by coining the name \emph{website oracle}, evaluating + sources of real-world website oracles, and performing our non-simulated + network experiments. + }, + label={ + paper:cat + }, +] + \maketitle + \begin{abstract} + \input{src/cat/src/abstract} + \end{abstract} + + \input{src/cat/src/intro} + \input{src/cat/src/background} + \input{src/cat/src/oracles} + \input{src/cat/src/sources} + \input{src/cat/src/sim} + \input{src/cat/src/wf} + \input{src/cat/src/discussion} + \input{src/cat/src/related} + \input{src/cat/src/conclusions} + \input{src/cat/src/ack} + + \bibliographystyle{plain} + \bibliography{src/cat/src/ref-min} + + \begin{appendices} + \input{src/cat/src/bayes} + \input{src/cat/src/lessons} + \input{src/cat/src/othersources} + \end{appendices} + +\end{kaupaper} diff --git a/summary/src/cat/src/abstract.tex b/summary/src/cat/src/abstract.tex new file mode 100644 index 0000000..da09599 --- /dev/null +++ b/summary/src/cat/src/abstract.tex @@ -0,0 +1,25 @@ +\noindent +Website Fingerprinting (WF) attacks are a subset of traffic analysis attacks +where a local passive attacker attempts to infer which websites a target victim +is visiting over an encrypted tunnel, such as the anonymity network Tor. 
We +introduce the security notion of a \emph{Website Oracle} (WO) that gives a WF +attacker the capability to determine whether a particular monitored website was +among the websites visited by Tor clients at the time of a victim's trace. Our +simulations show that combining a WO with a WF attack---which we refer to as a +WF+WO attack---significantly reduces false positives for about half of all +website visits and for the vast majority of websites visited over Tor. The +measured false positive rate is on the order of one false positive per million +classified website traces for websites around Alexa rank 10,000. Less popular +monitored websites show orders of magnitude lower false positive rates. + +{\setlength{\parindent}{6mm} We argue that WOs are inherent to the setting of +anonymity networks and should be an assumed capability of attackers when +assessing WF attacks and defenses. Sources of WOs are abundant and available to +a wide range of realistic attackers, e.g., due to the use of DNS, OCSP, and +real-time bidding for online advertisement on the Internet, as well as the +abundance of middleboxes and access logs. Access to a WO indicates that the +evaluation of WF defenses in the open world should focus on the highest possible +recall an attacker can achieve. Our simulations show that augmenting the Deep +Fingerprinting WF attack by Sirinam \emph{et~al.}~\cite{DF} with access to a WO +significantly improves the attack against five state-of-the-art WF defenses, +rendering some of them largely ineffective in this new WF+WO setting.} diff --git a/summary/src/cat/src/ack.tex b/summary/src/cat/src/ack.tex new file mode 100644 index 0000000..4008faf --- /dev/null +++ b/summary/src/cat/src/ack.tex @@ -0,0 +1,9 @@ +\section*{Acknowledgements} +We would like to thank Jari Appelgren, Roger Dingledine, Nicholas Hopper, Marc +Juarez, George Kadianakis, Linus Nordberg, Mike Perry, Erik Wästlund, and the +PETS reviewers for their valuable feedback. 
Simulations were performed using the +Swedish National Infrastructure for Computing (SNIC) at +High Performance Computing Center North (HPC2N). +This research was funded by the +Swedish Internet Foundation and the +Knowledge Foundation of Sweden. diff --git a/summary/src/cat/src/background.tex b/summary/src/cat/src/background.tex new file mode 100644 index 0000000..bb64337 --- /dev/null +++ b/summary/src/cat/src/background.tex @@ -0,0 +1,211 @@ +\section{Background} \label{cat:sec:back} +Here we present background on terminology, the anonymity network Tor, +and WF attacks and defenses. + +\subsection{Anonymity and Unobservability} +Anonymity is the state of a subject not being identifiable from an attacker's +perspective within the \emph{anonymity set} of possible subjects that performed +an action such as sending or receiving a message~\cite{anonterm}. For an +anonymity network, an attacker may not be able to determine who sent a message +into the network---providing a sender anonymity set of all possible +senders---and conversely, not be able to determine the recipient of a message +from the network out of all possible recipients in the recipient anonymity set. +Inherent for anonymity is that the subjects in an anonymity set change based on +what the attacker observes, e.g., when some subjects send or receive +messages~\cite{KedoganAP02,Raymond00}. In gist, anonymity is concerned with +hiding the \emph{relationship} between a sender and recipient, not its +existence. + +Unobservability is a strictly stronger notion than +anonymity~\cite{KedoganAP02,anonterm,Raymond00}. In addition to anonymity of the +relationship between a sender and recipient, unobservability also requires that +an attacker (not acting as either the sender or recipient) cannot sufficiently +distinguish if there is a sender or recipient or not~\cite{anonterm}. 
Perfect +unobservability is therefore the state of an attacker being unable to determine +if a sender/recipient should be part of the anonymity set or not. + +\subsection{Tor} +Tor is a low-latency anonymity network for anonymising TCP streams with about +eight million daily users, primarily used for anonymous browsing, censorship +circumvention, and providing anonymous (onion) services~\cite{tor,torusage}. +Because Tor is designed to be usable for low-latency tasks such as web browsing, +its threat model and design does not consider powerful attackers, e.g., global +passive adversaries that can observe all network traffic on the +Internet~\cite{trilemma,tor}. However, less powerful attackers such as ISPs and +ASes that observe a fraction of network traffic on the Internet are in scope. + +Users typically use Tor Browser---a customised version of Mozilla Firefox +(bundled with a local relay)---as a client that sends traffic through three +\emph{relays} when browsing a website on the regular Internet: a guard, middle, +and exit relay. Traffic from the client to the exit is encrypted in multiple +layers as part of fixed-size cells such that only the guard relay knows the +IP-address of the client and only the exit relay knows the destination website. +There are about 7000 public relays at the time of writing, all available in the +\emph{consensus} generated periodically by the network. The consensus is public +and therefore anyone can trivially determine if traffic is coming from the Tor +network by checking if the IP-address is in the consensus. Note that the +encrypted network traffic in Tor is exposed to network adversaries as well as +relays as it traverses the Internet. Figure~\ref{cat:fig:setting} depicts the +setting just described, highlighting the anonymity sets of users of Tor Browser +and the possible destination websites. 
+ +\begin{figure}[!t] + \centering + \includegraphics[width=.85\columnwidth]{src/cat/img/setting} + \caption{Using Tor to browse to a website, where an attacker observes the encrypted traffic into the Tor network for a target user, attempting to determine the website the user is visiting.} + \label{cat:fig:setting} +\end{figure} + +\subsection{Website Fingerprinting} +\label{cat:sec:back:wf} +As mentioned in the introduction, attacks that analyse the encrypted network +traffic (a trace) between a Tor client and a guard relay with the goal to detect +the website a client is visiting are referred to as \emph{website +fingerprinting} (WF) attacks. Figure~\ref{cat:fig:setting} shows the typical +location of the attacker, who can also be the guard itself. WF attacks are +evaluated in either the \emph{closed} or the \emph{open} world. In the closed +world, an attacker \emph{monitors} a number of websites and it is the goal of +the attacker to determine which website out of all the possible monitored +websites a target is visiting. The open world is like the closed world with one +significant change: the target user may also visit \emph{unmonitored} websites. +This means that in the open world the attacker may also classify a trace as +unmonitored in addition to monitored, posing a significantly greater challenge +for the attacker in a more realistic setting than the closed world. The ratio +between monitored and unmonitored traces in a dataset is further a significant +challenge for WF attacks when assessing their real-world significance for +Tor~\cite{DBLP:conf/ccs/JuarezAADG14}. Typically, WF attacks are evaluated on +the frontpages of web\emph{sites}: web\emph{page} fingerprinting is presumably +much more challenging due to the orders of magnitude of more webpages than +websites. Unless otherwise stated, we only consider the frontpages of websites +in this paper. 
+ +\subsubsection{Website Fingerprinting Attacks} +Prior to WF attacks being considered for use on Tor, they were used against +HTTPS~\cite{cheng1998traffic}, web +proxies~\cite{Hintz02,DBLP:conf/sp/SunSWRPQ02}, SSH +tunnels~\cite{DBLP:conf/ccs/LiberatoreL06}, and VPNs~\cite{HerrmannWF09}. For Tor, WF attacks are typically based on machine learning and can be +categorized based on if they use deep learning or not. + +Traditional WF attacks in the literature use manually engineered features +extracted from both the size and timing of packets (and/or cells) sent by Tor. +State of the art attacks with manually engineered features are +Wang-kNN~\cite{Wang}, CUMUL~\cite{cumul}, and k-FP~\cite{kfp}. For reference, +Wang-kNN has 1225 features, CUMUL 104 features, and k-FP 125 features. In terms +of accuracy, k-FP appears to have a slight edge over the other two, but all +three report over 90\% accuracy against significantly sized datasets. As +traditional WF attacks progressed, the features more than the type of machine +learning method have shown to be vital for the success of attacks, with an +emerging consensus on what are important features (e.g., coarse features like +number of incoming and outgoing packets)~\cite{Cherubin17,kfp,cumul}. + +Deep learning was first used for WF attacks by Abe and Goto in +2016~\cite{abe2016fingerprinting}. Relatively quickly, Rimmer \emph{et~al.} +reached parity with traditional WF attacks, lending credence to the emerging +consensus that the research community had found the most important features for +WF~\cite{DBLP:conf/ndss/RimmerPJGJ18}. However, recently Sirinam et +al.~\cite{DF} with Deep Fingerprinting (DF) significantly improved on other WF +attacks, also on the WTF-PAD and Walkie-Talkie defenses, and is at the time of +writing considered state-of-the-art. DF is based on a Convolutional Neural +Network (CNN) with a customized architecture for WF. 
Each packet trace as input +to DF is simply a constant size (5000) list of cells (or packets) and their +direction (positive for outgoing, negative for incoming), ignoring size and +timings. Based on the input, the CNN learns features on its own: we do not know +what they are, other than preliminary work indicating that the CNN gives more +weight to input early in the trace~\cite{mathews2018understanding}. + +The last layer of the CNN-based architecture of DF is a \emph{softmax} function: +it assigns (relative) probabilities to each class as the output of +classification. These probabilities allow a threshold to be defined for the +final classification in the open world, requiring that the probability of the +most likely class is above the threshold to classify as a monitored website. + +\subsubsection{Website Fingerprinting Defenses} +WF defenses for Tor modify the timing and number of (fixed-size) cells sent over +Tor when a website is visited. The modifications are done by injecting dummy +traffic and introducing artificial delays. Defenses can typically be classified +as either based on constant-rate traffic or not, where constant rate defenses +force all traffic to fit a pre-determined structure, forming \emph{collision +sets} for websites where their traffic traces appear identical to an attacker. +Non-constant rate defenses simply more-or-less randomly inject dummy traffic +and/or artificial delays with the hope of obfuscating the resulting network +traces. WF defenses are typically compared in terms of their induced +\emph{bandwidth} (BOH) and \emph{time} (TOH) overheads compared to no defense. +Further, different WF defenses make more or less realistic and/or practical +assumptions, making comparing overheads necessary but not nearly sufficient for +reaching conclusions. 
+ +We briefly describe WF defenses that we later use to evaluate the impact of +attackers performing enhanced WF attacks with access to WOs: +\begin{description} + \item[Walkie-Talkie] by Wang and Goldberg~\cite{WT} puts Tor Browser into + half duplex mode and pads traffic such that different websites result in the + same cell sequences. This creates a collision set between a visited website + and a target \emph{decoy website} which results in the same cell sequence + with the defense. Their evaluation shows 31\% BOH and 34\% TOH. Collision + sets grow beyond size two at the cost of BOH. + \item[WTF-PAD] by Juarez \emph{et~al.}~\cite{wtf-pad} is based on the idea + of \emph{adaptive padding} \cite{DBLP:conf/esorics/ShmatikovW06} where fake + padding is injected only when there is no real traffic to send. The defense + is simulated on collected packet traces and its design is the foundation of + the circuit padding framework recently implemented in Tor. The simulations + report 50-60\% BOH and 0\% TOH. + \item[CS-BuFLO] by Cai \emph{et~al.}~\cite{csbuflo} is a \emph{constant rate} + defense where traffic is always sent at a constant rate between a sender and + receiver, improving on prior work by Dyer~et + al.~\cite{DBLP:conf/sp/DyerCRS12}. Their evaluation shows 220-270\% BOH and + 270-340\% TOH. + \item[Tamaraw] by Cai \emph{et~al.}~\cite{Tamaraw} is another constant rate defense + that further improves on CS-BuFLO. In the evaluation by Wang and Goldberg, + they report 103\% BOH and 140\% TOH for Tamaraw~\cite{WT}. + \item[DynaFlow] by Lu \emph{et~al.}~\cite{DynaFlow} is a \emph{dynamic} + constant-rate defense that allows for the defense to adjust its parameters + (notably the ``inter-packet interval'') based on configuration and on the + observed traffic. The evaluation shows an overall improvement over Tamaraw + when configured to use similar overheads. 
+\end{description} +The primary downside of defenses like Walkie-Talkie that depend on creating +collision sets for websites is that they require up-to-date knowledge of the +target website(s) to create collisions with (to know how to morph the traffic +traces): this is a significant practical issue for +deployment~\cite{DBLP:conf/wpes/NithyanandCJ14,Wang,WT}. Constant rate defenses +like CS-BuFLO and Tamaraw are easier to deploy but suffer from significant +overheads~\cite{csbuflo,Tamaraw}. WTF-PAD is hard to implement both efficiently +and effectively in practice due to only being simulated on packet traces as-is +and also being vulnerable to attacks like Deep Fingerprinting~\cite{wtf-pad,DF}. +While DynaFlow shows great promise, it requires changes at the client (Tor +Browser, local relay, or both) and at exit relays to \emph{combine} packets with +payloads smaller than Tor's cell size~\cite{DynaFlow}. Without combined packets +its advantage in terms of overhead compared to Tamaraw likely shrinks. + +\subsection{Challenges for WF Attacks in Practice} +A number of practical challenges for an attacker performing WF attacks have been +highlighted over the years, notably comprehensively so by Mike Perry of the Tor +Project~\cite{perryCrit} and Juarez et +al.~\cite{DBLP:conf/ccs/JuarezAADG14}. Wang and Goldberg have shown that +several of the highlighted challenges---such as maintaining a fresh data set and +determining when websites are visited---are practical to overcome~\cite{DBLP:journals/popets/WangG16}. What remains are two notably +significant challenges: distinguishing between different goals of the attacker +and addressing false positives. + + +For attacker goals when performing WF attacks, an attacker may want to detect +website visits with the goal of censoring access to them, to identify all users +that visit particular websites, or to identify every single website visited by +a target~\cite{perryCrit}.
Clearly, these goals put different +constraints on the attacker. For censorship, classification must happen before +content is actually allowed to be completely transferred to the victim. For +monitoring only a select number of websites the attacker has the most freedom, +while detecting all website visits by a victim requires the attacker to have +knowledge of all possible websites on the web. + +For addressing false positives there are a number of aspects to take into +account. First, the web has millions of websites that could be visited by a +victim (not the case for onion services~\cite{JansenJGED18}), and each website +has a significant number of webpages that are often dynamically generated and +frequently changed~\cite{DBLP:conf/ccs/JuarezAADG14,perryCrit}. Secondly, how +often victims potentially visit websites that are monitored by an attacker is +unknown to the attacker, i.e., the \emph{base rate} of victims is unknown. The +base rate leads to even a small false positive rate of a WF attack overwhelming +an attacker with orders of magnitude more false positives than true positives, +leaving WF attacks impractical for most attacker goals in practice. + diff --git a/summary/src/cat/src/bayes.tex b/summary/src/cat/src/bayes.tex new file mode 100644 index 0000000..344d712 --- /dev/null +++ b/summary/src/cat/src/bayes.tex @@ -0,0 +1,96 @@ +\section{Bayes' Law for Estimating Utility of Website Oracles} \label{cat:app:bayes} + +To reason about the advantage to an attacker of having access to a WO, we +estimate the conditional probability of a target user visiting a monitored +website.
For conditional probability we know that: +\begin{equation} + P(C_0 \cap C_1) = P(C_0|C_1)P(C_1) +\end{equation} + +For a hypothesis $H$ given conditional evidence $E$, Bayes' theorem states +that: +\begin{equation} + P(H|E) = \frac{P(E|H) P(H)}{P(E)} +\end{equation} + +Assume that $E = E_0 \cap E_1$, then: +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(E_0 \cap E_1|H) P(H)}{P(E_0 \cap E_1)} +\end{equation} + +Substituting $P(E_0 \cap E_1)$ with (1) we get: +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(E_0 \cap E_1|H) P(H)}{P(E_0|E_1)P(E_1)} +\end{equation} + +For a timeframe $t$, we define +\begin{description} + \item[$H$] the probability that target user(s) visited website $w$ over Tor in $t$ + \item[$E_0$] the probability that target user(s) visited a website over Tor in $t$ + \item[$E_1$] the probability that someone visited website $w$ over Tor in $t$ +\end{description} + +We see that $P(E_0 \cap E_1|H) = 1$ by definition and get: +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(H)}{P(E_0|E_1)P(E_1)} +\end{equation} + +Consider $P(E_0|E_1)$: while the conditional $E_1$ may have some minor effect on +user behaviour (in particular for overall popular websites), we assume that the +popularity of using Tor to visit a particular website (by any of the users of +Tor) has negligible impact on $E_0$ and treat $E_0$ and $E_1$ as independent: + +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(H)}{P(E_0)P(E_1)} +\end{equation} + +We can further refine $P(H)$ as being composed of at least: + +\begin{equation} + P(H) = P(E_0 \cap B_w) = P(E_0|B_w)P(B_w) +\end{equation} + +where $P(B_w)$ is the base rate (prior) of the user(s) visiting website $w$ out +of all possible websites they visit ($P(E_0)$).
We again assume (perhaps +naively) that $E_0$ is also independent of $B_w$, which gives us: + +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(E_0)P(B_w)}{P(E_0)P(E_1)} = \frac{P(B_w)}{P(E_1)} +\end{equation} + +In other words, if an attacker learns that target user(s) visited a website +($E_0$) over Tor and that website $w$ was also visited over Tor by some user +($E_1$), then we can estimate the probability that it was target user(s) that +visited website $w$ ($H$) as the ratio between the base rate (prior) for +visiting $w$ of target user(s) ($B_w$) and the probability that someone visited +the website over Tor ($E_1$), all within a timeframe $t$. + +Figure~\ref{cat:fig:bt} shows the results for simulating the probability $P(H|E_0 +\cap E_1)$ for different website popularities of $w$, base rates, and +timeframes. We see that with a realistic timeframe of $100$ ms, for all +base-rates but $10^{-6}$ there is non-zero conditional probability (and +therefore utility of WO access) for Alexa top 100k or less popular websites, +which covers about half of all website visits over Tor (excluding +\url{torproject.org}). 
+ +\begin{figure}[!t] + \centering + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/bt10} + \caption{10 ms} + \label{cat:fig:bt:10ms} + \end{subfigure} + \begin{subfigure}[b]{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/bt100} + \caption{100 ms} + \label{cat:fig:bt:100ms} + \end{subfigure} + \begin{subfigure}[b]{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/bt1000} + \caption{1000 ms} + \label{cat:fig:bt:1000ms} + \end{subfigure} + \caption{The conditional probability as a function of user base rate and + website popularity (Alexa) for three different timeframes.} + \label{cat:fig:bt} +\end{figure} diff --git a/summary/src/cat/src/conclusions.tex b/summary/src/cat/src/conclusions.tex new file mode 100644 index 0000000..8554d23 --- /dev/null +++ b/summary/src/cat/src/conclusions.tex @@ -0,0 +1,25 @@ +\section{Conclusions} \label{cat:sec:conc} +WF+WO attacks use the base rate of all users of the network against victims, +significantly reducing false positives in the case of all but the most popular +websites visited over Tor. This is troubling in many ways, in part because +presumably many sensitive website visits are to unpopular websites used only by +local communities in regions of the world where the potential consequences of +identification are the worst. + +The threat model of Tor explicitly states that Tor does not consider attackers +that can observe both incoming and outgoing traffic~\cite{tor}. Clearly, a WO +gives the capability to infer what the outgoing traffic of the network encodes +on the application layer (the website visits). This is in a sense a violation of +Tor's threat model when combined with a WF attacker that also observes incoming +traffic. 
However, we argue that because of the plethora of possible ways for an +attacker to infer membership in the anonymity sets of Tor, WOs should be +considered within scope simply because Tor asserts that it is an anonymity +network. + +While the real-world impact of WF attacks on Tor users remains an open question, +our simulations show that false positives can be significantly reduced by many +attackers with little extra effort for some WO sources. Depending on WO source, +this comes at a trade-off of less recall. For many attackers and attacker goals, +however, this is a worthwhile trade. To us, the threat of WF attacks appears +more real than ever, especially when also considering recent advances by deep +learning based attacks like DF~\cite{DF} and DeepCorr~\cite{deepcorr}. diff --git a/summary/src/cat/src/discussion.tex b/summary/src/cat/src/discussion.tex new file mode 100644 index 0000000..0de8584 --- /dev/null +++ b/summary/src/cat/src/discussion.tex @@ -0,0 +1,153 @@ +\section{Discussion} \label{cat:sec:disc} +For defenses that are based on the idea of creating collision sets between +packet traces generated by websites, oracle access is equivalent to being able +to perform set intersection between the set of websites in a collision set and +monitored websites visited at the time of fingerprinting. As the results show +from Section~\ref{cat:sec:wf}, some defenses can significantly reduce the recall of +WF attacks with WOs, but not the precision for the majority of websites and +website visits in Tor. + +Next, in Section~\ref{cat:sec:disc:unmon} we further cement that our simulations +show that WOs significantly reduce false positives, highlighting that a WF+WO +attacker surprisingly infrequently has to query a WO when classifying +unmonitored traces. Section~\ref{cat:sec:disc:flawed} discusses the impact of +imperfect WO sources with limited observability and false positives on the joint +WF+WO attack.
Finally, Section~\ref{cat:sec:disc:limits} covers limitations of our +work, and Section~\ref{cat:sec:disc:miti} discusses possible mitigations. + +\subsection{A Large Unmonitored Dataset} +\label{cat:sec:disc:unmon} +We look at the number of false positives for a large test dataset consisting of +only unmonitored websites (representing a target user(s) with base rate 0, i.e., +that never visits any monitored website). Using the dataset of Greschbach et +al.~\cite{DBLP:conf/ndss/GreschbachPRWF17}, we trained DF on 100x100 monitored +and 10k unmonitored websites (8:2 stratified split for validation), resulting in +about 80\% validation accuracy after 30 epochs (so DF is clearly successful also +on this dataset). We then tested the trained classifier on \emph{only} 100k +\emph{unmonitored} traces, with and without oracle access (100ms resolution) for +different assumptions of the popularity of the \emph{monitored} websites. With +ten repetitions of the above experiment, we observed a false positive rate in +the order of $10^{-6}$ for monitored websites with Alexa popularity 10k. +Excluding \url{torproject.org}, this indicates that an attacker would have close +to no false positives for about half of all website visits in Tor, according to +the distribution of Mani \emph{et~al.}~\cite{torusage} (see +Section~\ref{cat:sec:sim:dist}). Without access to a WO, DF had a false positive +rate in the order of $10^{-3}$ to $10^{-4}$, depending on the threshold used by +the attacker. 
+ +Recall how WOs are used as part of WF+WO attacks in +Section~\ref{cat:sec:oracles:generic}: the WO is only used if the WF attack +classifies a trace as monitored.\footnote{For WF attacks like DF that produces a +list of probabilities (Definition~\ref{cat:def:oracleprob}), just assume that the +attacker picks the threshold and checks if the probability is above as part of +the if-statement before using the WO.} This means that, in the example above, +the WO is used on average every $10^3$ to $10^4$ trace, to (potentially) rule +out a false positive. Clearly, this means that WO sources that can only be used +infrequently, e.g., due to caching as in DNS, are still valuable for an +attacker. + +\subsection{Imperfect Website Oracle Sources} +\label{cat:sec:disc:flawed} +Our analysis considered an ideal source of a WO that observes all visits to +targeted \emph{monitored} websites of the attacker and that produces no false +positives. Next, using the same dataset and setup as for +Figure~\ref{cat:fig:df:nodef} with an Alexa starting rank of $10^4$, we simulate the +impact on recall and the False-Negative-to-Positive-rate\footnote{For a +classifier with multiple monitored classes and an unmonitored class (as for our +modified DF, see Section~\ref{cat:sec:wf:aug}), FNP captures the case when the +classifier classifies an unmonitored testing trace as any monitored class. In +addition to FNP, such a classifier can also confuse one monitored website for +another. Both these cases are false positives~\cite{Wang2015a}.} (FNP) of the +\emph{joint} WF+WO attack for five false positive rates \emph{of the WO} and a +fraction of observed website visits. + +Figure~\ref{cat:fig:factor-recall} shows the impact on the joint recall in the above +setting. We see that recall is directly proportional to the fraction of observed +visits, as per the results of Greschbach +\emph{et~al.}~\cite{DBLP:conf/ndss/GreschbachPRWF17}. 
Further, false positives +for the WO have a positive impact on the fraction of recall, counteracting +visits missed due to limited observability. For the same reason, a larger +timeframe or monitoring more popular websites would also improve recall. + +\begin{figure}[!t] + \centering + \includegraphics[width=.67\columnwidth]{src/cat/img/factor-recall} + \caption{How limited WO observability affects the final recall of a WF+WO attack for five different WO false positive rates.} + \label{cat:fig:factor-recall} +\end{figure} + +Figure~\ref{cat:fig:factor-fnp} shows the impact on the joint FNP. Note that lower +FNP is better for an attacker. We see that limited observability has no impact +on FNP. This makes sense, because a WO cannot confirm anything it does not +observe. The FNP fraction for the joint FNP is roughly proportional to the FP of +the WO. We also see that the FNP fraction consistently is above the FP of the +WO: this is because---beyond the simulated FP---there is a slight probability +that someone else (in our simulation of the Tor network) visited the website for +each classified trace. A larger timeframe or monitoring more popular websites +would also increase FNP. + +\begin{figure}[!t] + \centering + \includegraphics[width=.67\columnwidth]{src/cat/img/factor-fnp} + \caption{How limited WO observability affects the final False-Negative-to-Positive-rate (FNP) of a WF+WO attack for five different WO false positive rates. Lower is better.} + \label{cat:fig:factor-fnp} +\end{figure} + +From the above results, our simulations indicate that even with a deeply imperfect +WO source an attacker can get significant advantage in terms of reduced false +positives at a comparatively small cost of recall. For example, given a WO with +50\% observability and false positives, the resulting WF+WO attack has about +75\% of the recall of the WF attack and slightly more than half the false +positives.
+ +\subsection{Limitations} +\label{cat:sec:disc:limits} +As discussed in Section~\ref{cat:sec:back:wf}, there are a number of practical +limitations in general for WF attacks. Regarding attacker goals, WOs are likely +less useful for the purpose of censorship than for other goals. Many sources of +WOs cannot be accessed in real-time, giving little utility for an attacker that +needs to make a near real-time censorship decision. An attacker that only wants +to detect visits to a few selected monitored websites gains significant utility +from WOs, as long as the detection does not have to be in real-time. It is also +noteworthy that an attacker that wants to detect all possible website visits by +a victim can use the WO to in essence ``close the world'' from all possible +websites to only those visited over Tor while the victim is actively browsing. +Granted, this requires a source for the WO that is slightly different from our +definition, but some do offer this: e.g., an attacker that gains comprehensive +control over the DNS resolvers used by Tor +exits~\cite{DBLP:conf/ndss/GreschbachPRWF17}. + +When it comes to false positives a significant limitation of our simulations is +that we consider fingerprinting the frontpages of websites and not specific +webpages. Several sources of WOs are not able to detect webpage visits. This is +also true for subsequent webpage visits on the same website after first visiting +the frontpage of a website (e.g., DNS and OCSP will be cached). An attacker with +the goal of detecting each such page visit will thus suffer more false positives +or fail at detecting them for some sources of WOs. + +\subsection{Mitigations} +\label{cat:sec:disc:miti} +The best defense against WOs is WF defenses that significantly reduce the recall +of WF attacks. In particular, if an attacker can significantly reduce the +website anonymity set \emph{after} accounting for information from the WO, then +attacks are likelier to succeed.
This implies that most websites need to (at +least have the potential to) result in the same network traces, as we see with +DynaFlow, Tamaraw, and CS-BuFLO. + +For onion websites we note that the DHT source of a WO from +Section~\ref{cat:sec:sources} is inherent to the design of onion services in Tor. +Defenses that try to make it harder to distinguish between regular website +visits and visits to onion websites should also consider this WO source as part +of their analysis, in particular for v2 onion services. + +Finally, some sources of WOs could be minimized. If you run a potentially +sensitive website: do not use RTB ads, staple OCSP, have as few DNS entries as +possible\footnote{As noted by +Greschbach~\emph{et~al.}~\cite{DBLP:conf/ndss/GreschbachPRWF17}, websites may +have several unique domain names. Each of those could be used independently to +query several sources (e.g., DNS) of WOs.} with a high TTL, do not use CDNs, do +not retain any access logs, and consider if your website, web server, or +operating system have any information leaks that can be used as an oracle. If +you run a Tor exit, consider not using Google or Cloudflare for your DNS but +instead use your ISP's resolver if +possible~\cite{DBLP:conf/ndss/GreschbachPRWF17}. diff --git a/summary/src/cat/src/intro.tex b/summary/src/cat/src/intro.tex new file mode 100644 index 0000000..b65baa9 --- /dev/null +++ b/summary/src/cat/src/intro.tex @@ -0,0 +1,122 @@ +\section{Introduction} \label{cat:sec:intro} +A Website Fingerprinting (WF) attack is a type of traffic analysis attack where +an attacker attempts to learn which websites are visited through encrypted +network tunnels---such as the low-latency anonymity network Tor~\cite{tor} or +Virtual Private Networks (VPNs)---by analysing the encrypted network +traffic~\cite{cheng1998traffic,HerrmannWF09,Hintz02,DBLP:conf/ccs/LiberatoreL06,PanchenkoNZE11,DBLP:conf/sp/SunSWRPQ02}. 
+The analysis considers only the size and timing of encrypted packets sent over +the network to and from a target client. This makes it possible for attackers +that only have the limited \emph{capability} of observing the encrypted network +traffic (sometimes referred to as a \emph{local eavesdropper}) to perform WF +attacks. Sources of such capabilities include ISPs, routers, network interface +cards, WiFi hotspots, and guard relays in the Tor network, among others. Access +to encrypted network traffic is typically not well-protected over the Internet +because it is already in a form that is considered safe to expose to attackers +due to the use of encryption. + +The last decade has seen significant work on improved WF attacks (e.g., +\cite{CaiZJJ12,kfp,DF,Wang}) and defenses (e.g, +\cite{csbuflo,Tamaraw,wtf-pad,DynaFlow}) accompanied by an ongoing debate on the +real-world impact of these attacks justifying the deployment of defenses or not, +in particular surrounding Tor (e.g., +\cite{DBLP:conf/ccs/JuarezAADG14,perryCrit,DBLP:journals/popets/WangG16}). +There are significant real-world challenges for an attacker to successfully +perform WF attacks, such as the sheer size of the web (about 200 million active +websites~\cite{netcraft-survey}), +detecting the beginning of website loads in encrypted network traces, background +traffic, maintaining a realistic and fresh training data set, and dealing with +false positives. + +Compared to most VPN implementations, Tor has some basic but rather ineffective +defenses in place against WF attacks, such as padding packets to a constant size +and randomized HTTP request pipelining~\cite{CaiZJJ12,tor,Wang}. Furthermore, +Tor recently started implementing a framework for circuit padding machines to +make it easier to implement traffic analysis defenses~\cite{tor-0401} +based on adaptive padding~\cite{wtf-pad,DBLP:conf/esorics/ShmatikovW06}. 
+However, the unclear real-world impact of WF attacks makes deployment of +proposed effective (and often prohibitively costly in terms of bandwidth and/or +latency overheads) WF defenses a complicated topic for researchers to reach +consensus on and the Tor Project to decide upon. + +\subsection{Introducing Website Oracles} +In this paper, we introduce the security notion of a \emph{Website Oracle} (WO) +that can be used by attackers to augment any WF attack. A WO answers ``yes'' or +``no'' to the question ``was a particular website visited over Tor at this point +in time?''. We show through simulation that such a \emph{capability}---access to +a WO---greatly reduces the false positive rate for an attacker attempting to +fingerprint the majority of websites and website visits through the Tor network. +The reduction is to such a great extent that our simulations suggest that false +positives are no longer a significant reason for why WF attacks lack real-world +impact. This is in particular the case for onion services where the estimated +number of websites is a fraction compared to the ``regular'' +web~\cite{JansenJGED18}. + +Our simulations are based on the privacy-preserving network measurement results +of the live Tor network in early 2018 by Mani \emph{et~al.}~\cite{torusage}. +Besides simulating WOs we also identify a significant number of potential +sources of WOs that are available to a wide range of attackers, such as nation +state actors, advertisement networks (including their customers), and operators +of Tor relays. Some particularly practical sources---due to DNS +and how onion services are accessed---can be used by anyone with modest +computing resources. 
+ +We argue that sources of WOs are inherent in Tor due to its design goal of +providing \emph{anonymous} and not \emph{unobservable} communication: observable +anonymity sets are inherent for anonymity~\cite{KedoganAP02,anonterm,Raymond00}, +and a WO can be viewed as simply being able to query for membership in the +destination/recipient anonymity set (the potential websites visited by a Tor +client). The solution to the effectiveness of WF+WO attacks is therefore not to +eliminate all sources---that would be impossible without unobservable +communication~\cite{KedoganAP02,anonterm,Raymond00}---but to assume that an +attacker has WO access when evaluating the effectiveness of WF attacks and +defenses, even for weak attackers like local (passive) eavesdroppers. + +The introduction of a WO in the setting of WF attacks is similar to how +encryption schemes are constructed to be secure in the presence of an attacker +with access to \emph{encryption} and \emph{decryption} oracles (chosen plaintext +and ciphertext attacks, respectively) \cite{GoldwasserM84,NaorY90,RackoffS91}. +This is motivated by the real-world prevalence of such oracles, and the high +impact on security when paired with other weaknesses of the encryption schemes: +e.g., Bleichenbacher~\cite{Bleichenbacher98} padding oracle attacks remain an +issue in modern cryptosystems today despite being discovered about twenty years +ago~\cite{Merget19,RonenGGSWY18}. + +\subsection{Contributions and Structure} +Further background on anonymity, Tor, and WF are presented in +Section~\ref{cat:sec:back}. Section~\ref{cat:sec:oracles} defines a WO and describes two +generic constructions for combining a WO with \emph{any} WF attack. 
Our generic +constructions are a type of Classify-Verify method by Stolerman +\emph{et~al.}~\cite{stolerman2013classify}, first used in the context of WF +attacks by Juarez \emph{et~al.}~\cite{DBLP:conf/ccs/JuarezAADG14} and later by +Greschbach \emph{et~al.} \cite{DBLP:conf/ndss/GreschbachPRWF17}. +Section~\ref{cat:sec:sources} presents a number of sources of WOs that can be used +by a wide range of attackers. We focus on practical sources based on DNS and +onion service directories in Tor, offering \emph{probabilistic} WOs that anyone +can use with modest resources. We describe how we simulate access to a WO +throughout the rest of the paper in Section~\ref{cat:sec:sim}, based on Tor network +measurement data from Mani \emph{et~al.}~\cite{torusage}. + +Section~\ref{cat:sec:wf} experimentally evaluates the performance of augmenting the +state-of-the-art WF attack Deep Fingerprinting (DF) by Sirinam +\emph{et~al.}~\cite{DF} with WO access using one of our generic constructions. +We show significantly improved classification performance against unprotected +Tor as well as against traces defended with the WF defenses WTF-PAD by Juarez +\emph{et~al.}~\cite{wtf-pad} and Walkie-Talkie by Wang and Goldberg~\cite{WT}, +concluding that the defenses are ineffective in this new setting where an +attacker has access to a WO. Further, we also evaluate DF with WO access against +Wang \emph{et~al.}'s dataset~\cite{Wang} with simulated traces for the +constant-rate WF defenses CS-BuFLO and Tamaraw by Cai et +al.~\cite{csbuflo,Tamaraw}. Our results show that constant-rate defenses are +overall effective defenses but not efficient due to the significant induced +overheads. We then evaluate two configurations of the WF defense DynaFlow by Lu +\emph{et~al.}~\cite{DynaFlow}, observing similar effectiveness as CS-BuFLO but +at lower overheads approaching that of WTF-PAD and Walkie-Talkie. 
+ +In Section~\ref{cat:sec:disc} we discuss our results, focusing on the impact on +false positives with WO access, how imperfect sources for WOs impact WF+WO +attacks, limitations of our work, and possible mitigations. Our simulations +indicate that WF defenses should be evaluated against WF attacks based on how +they minimise \emph{recall}. We present related work in +Section~\ref{cat:sec:related}, including how WF+WO attacks relate to traffic +correlation and confirmation attacks. Section~\ref{cat:sec:conc} briefly concludes +this paper. diff --git a/summary/src/cat/src/lessons.tex b/summary/src/cat/src/lessons.tex new file mode 100644 index 0000000..70f49f3 --- /dev/null +++ b/summary/src/cat/src/lessons.tex @@ -0,0 +1,47 @@ +\section{Lessons from Simulation} \label{cat:app:lessons} +With the ability to simulate access to WOs we can now simulate the entire +website anonymity set for Tor. To get a better understanding of why WOs are so +useful for an attacker performing WF attacks, we look at two results from the +simulation below. + +\subsection{Time Until Website Visited over Tor} +\label{cat:sec:sim:timeuntil} + +Figure~\ref{cat:fig:timeuntil} shows the time until there is a 50\% probability that +a website has been visited over Tor depending on website popularity (Alexa, as +discussed in Section~\ref{cat:sec:sim:dist}). Within ten seconds, we expect that +most of Alexa top 1k has been visited. Recall that this represents about one +third of all website visits over Tor. The less popular websites on Alexa top +one-million represent another third of all visits, quickly approaching hundreds +of seconds between visits. For the remaining third of all website visits we +expect them to be even less frequent. 
+ +\begin{figure}[!t] + \centering + \includegraphics[width=.67\columnwidth]{src/cat/img/timeuntilvisited} + \caption{The simulated time until there is a 50\% probability that a website for different Alexa ranks has been visited over Tor.} + \label{cat:fig:timeuntil} +\end{figure} + +\subsection{Visits Until First False Positive} +\label{cat:sec:sim:fp} +Assume that target user(s) have a base rate of $0$, i.e., they never visit the +attacker's monitored websites. With WO access, we can determine how many +(naively assumed independent) website visits it \emph{at least} takes until +there is a 50\% chance that the attacker's classifier gets a false positive. +This is because if the attacker's website classifier without oracle access +always returns a false positive, then the false positive rate by the WF+WO +attack will be determined by when the WO says that the---incorrectly classified +as monitored---website has been visited. Figure~\ref{cat:fig:probfp} shows the +expected number of visits \emph{by the victim(s)} for different timeframes based +on the popularity of the monitored websites. Note that the attacker per +definition chooses which websites are monitored and can therefore take the +probability of false positives into account. 
+ +\begin{figure}[t] + \centering + \includegraphics[width=.67\columnwidth]{src/cat/img/probfp} + \caption{The number of website visits until there is a 50\% probability + that a website oracle would contribute to a false positive.} + \label{cat:fig:probfp} +\end{figure} diff --git a/summary/src/cat/src/main.tex b/summary/src/cat/src/main.tex new file mode 100644 index 0000000..f12287b --- /dev/null +++ b/summary/src/cat/src/main.tex @@ -0,0 +1,121 @@ +\documentclass[USenglish,oneside,twocolumn]{article} + +\usepackage[utf8]{inputenc}%(only for the pdftex engine) +%\RequirePackage[no-math]{fontspec}%(only for the luatex or the xetex engine) +\usepackage[big]{dgruyter_NEW} +\usepackage{subcaption} +\usepackage[dvipsnames]{xcolor} +\usepackage{mathtools} +\usepackage{amsthm, amssymb} +\usepackage[]{cryptocode} +\usepackage{footmisc} +\hypersetup{ + colorlinks, + citecolor=RedViolet, + linkcolor=RedViolet, + urlcolor=MidnightBlue} + +\theoremstyle{definition} +\newtheorem{definition}{Definition} + +\DOI{foobar} + +\newcommand{\TODO}[1]{\textcolor{red}{TODO:} #1} + +\cclogo{\includegraphics{by-nc-nd.pdf}} + +\begin{document} + + \author*[1]{Tobias Pulls} + \author[2]{Rasmus Dahlberg} + \affil[1]{Karlstad University, E-mail: tobias.pulls@kau.se} + \affil[2]{Karlstad University, E-mail: rasmus.dahlberg@kau.se} + + \title{\huge Website Fingerprinting with Website Oracles} + + \runningtitle{Website Fingerprinting with Website Oracles} + + \begin{abstract} + {\input{src/abstract}} + \end{abstract} + \keywords{website fingerprinting, website oracles, traffic analysis, security model, design} +% \classification[PACS]{} +% \communicated{...} +% \dedication{...} + +\journalname{Proceedings on Privacy Enhancing Technologies} +\DOI{Editor to enter DOI} +\startpage{1} +\received{..} +\revised{..} +\accepted{..} + +\journalyear{..} +\journalvolume{..} +\journalissue{..} + +\maketitle +\eject +\section{Introduction} +\label{cat:sec:intro} +\input{src/intro} + +\section{Background} 
+\label{cat:sec:back} +\input{src/background} + +\section{Website Oracles} +\label{cat:sec:oracles} +\input{src/oracles} + +\section{Sources of Website Oracles} +\label{cat:sec:sources} +\input{src/sources} + +\section{Simulating Website Oracles} +\label{cat:sec:sim} +\input{src/sim} + +\section{Deep Fingerprinting with Website Oracles} +\label{cat:sec:wf} +\input{src/wf} + +\section{Discussion} +\label{cat:sec:disc} +\input{src/discussion} + +\section{Related Work} +\label{cat:sec:related} +\input{src/related} + +\section{Conclusions} +\label{cat:sec:conc} +\input{src/conclusions} + +\section*{Acknowledgements} +We would like to thank Jari Appelgren, Roger Dingledine, Nicholas Hopper, Marc +Juarez, George Kadianakis, Linus Nordberg, Mike Perry, Erik Wästlund, and the +PETS reviewers for their valuable feedback. Simulations were performed using the +\href{http://snic.se/}{Swedish National Infrastructure for Computing} (SNIC) at +\href{https://www.hpc2n.umu.se/}{High Performance Computing Center North} +(HPC2N). This research was funded by the +\href{https://internetstiftelsen.se/en/}{Swedish Internet Foundation} and the +\href{www.kks.se}{Knowledge Foundation of Sweden}. + +\bibliographystyle{abbrv} +\bibliography{ref-min} + +\appendix +\section{Bayes' Law for Estimating Utility of Website Oracles} +\label{cat:app:bayes} +\input{src/bayes} + +\section{Lessons from Simulation} +\label{cat:app:lessons} +\input{src/lessons} + +\section{Sources of Website Oracles} +\label{cat:app:sources} +\input{src/othersources} + +\end{document} diff --git a/summary/src/cat/src/oracles.tex b/summary/src/cat/src/oracles.tex new file mode 100644 index 0000000..01f1d99 --- /dev/null +++ b/summary/src/cat/src/oracles.tex @@ -0,0 +1,126 @@ +\section{Website Oracles} \label{cat:sec:oracles} +We first define a WO and then present two generic constructions for use with WF +attacks based on the kind of output the WF attack supports. 
+ +\subsection{Defining Website Oracles} +\begin{definition} + \label{cat:def:oracle} + A website oracle answers true or false to the question ``was a particular + monitored website $w$ visited over the Tor network at time $t$?''. +\end{definition} + +A WO considers only web\emph{sites} and not web\emph{pages} for $w$, but note +that even for webpage fingerprinting being able to narrow down the possible +websites that webpages belong to through WO access is a significant advantage to +an attacker. The time $t$ refers to a \emph{period of time} or \emph{timeframe} +during which a visit should have taken place. Notably, different sources of WOs +may provide different \emph{resolutions} for time, forcing an attacker to +consider a timeframe in which a visit could have taken place. For example, +timestamps in Apache or nginx access logs use regular Unix timestamps as default +(i.e., seconds), while CDNs like Cloudflare maintain logs with Unix nanosecond +precision. Further, there are inherent limitations in approximating $t$ for the +query when the attacker in addition to WO access can only directly observe +traffic from the victim into Tor. We explore this later in +Section~\ref{cat:sec:sim:timeframe}. + +One important limitation we place on the use of a WO with WF is that the +attacker can only query the WO for \emph{monitored} websites. The open world +setting is intended to capture a more realistic setting for evaluating attacks, +and inherent in this is that the attacker cannot train (or even enumerate) all +possible websites on the web. Given the ability to enumerate and query all +possible websites gives the adversary a capability in line with a global passive +adversary performing correlation attacks, which is clearly outside of the threat +model of Tor~\cite{tor}. We further relate correlation and confirmation attacks +to WF+WO attacks in Section~\ref{cat:sec:related}. 
+ +Definition~\ref{cat:def:oracle} defines the ideal WO: it never fails to observe a +\emph{monitored} website visit, it has no false positives, and it can answer for +an arbitrary $t$. This is similar to how encryption and decryption oracles +always encrypt and decrypt when modelling security for encryption +schemes~\cite{GoldwasserM84,NaorY90,RackoffS91}. In practice, sources of all of +these oracles may be more or less ideal and challenging for an attacker to use. +Nevertheless, the prevalence of sources of these imperfect oracles motivate the +assumption of an attacker with access to an ideal oracle. Similarly, for WOs, we +motivate this assumption in Sections~\ref{cat:sec:sources}~and~\ref{cat:sec:sim}, in +particular wrt.\ a timeframe on the order of (milli)seconds. +Section~\ref{cat:sec:disc} further considers non-ideal sources of WOs and the effect +on WF+WO attacks, both when the WO can produce false positives and when the +source only observes a fraction of visits to monitored websites. + +\subsection{Generic Website Fingerprinting Attacks with Website Oracles} +\label{cat:sec:oracles:generic} +As mentioned in Section~\ref{cat:sec:back:wf}, a WF attack is a classifier that is +given as input a packet trace and provides as output a classification. The +classification is either a monitored site or a class representing unmonitored +(in the open world). Figure~\ref{cat:fig:setting-oracle} shows the setting where an +attacker capable of performing WF attacks also has access to a WO. 
We define a +generic construction for WF+WO attacks that works with \emph{any} WF attack in +the open world in Definition~\ref{cat:def:oraclewf}: + +\begin{figure}[!t] + \centering + \includegraphics[width=\columnwidth]{src/cat/img/setting-oracle} + \caption{WF+WO attacks, where the WO infers membership of a particular website $w$ in the website anonymity set of all possible websites visited over Tor during a particular timeframe $t$.} + \label{cat:fig:setting-oracle} +\end{figure} + +\begin{definition}[Binary verifier] + \label{cat:def:oraclewf} + Given a website oracle $o$ and WF classification $c$ of a trace collected at + time $t$, if $c$ is a monitored class, query the oracle $o(c,t)$. Return $c$ + if the oracle returns true, otherwise return the unmonitored class. +\end{definition} + +Note that the WO is only queried when the WF \emph{classification} is for a +monitored website and that Definition~\ref{cat:def:oraclewf} is a generalisation of +the ``high precision'' DefecTor attack by Greschbach +\emph{et~al.}~\cite{DBLP:conf/ndss/GreschbachPRWF17}. In terms of +\emph{precision} and \emph{false positives}, the above generic WF+WO +construction is strictly superior to a WF attack without a WO. Assume that the +WF classification incorrectly classified an unmonitored trace as monitored, then +there is \emph{only a probability} that a WO also returns true, depending on the +probability that someone else visited the website in the same timeframe over +Tor. If it does not, then a false positive is prevented. That is, a WF attack +without WO access is identical to a WF attack with access to a useless WO that +always returns true; any improvements beyond that will only help the attacker in +ruling out false positives. We consider the impact on \emph{recall} later. 
+ +We can further refine the use of WOs for the subset of WF attacks that support +providing as output an ordered list of predictions in decreasing likelihood, +optionally with probabilities, as shown in Definition~\ref{cat:def:oracleprob}: + +\begin{definition}[List verifier] + Given an ordered list of predictions in the open world and a website oracle: + + {\centering + \pseudocode[syntaxhighlight=auto]{% + \t\pcfor \text{top prediction $p$ in list} \pcdo \\ + \t\pcind \pcif p \text{ is unmonitored or oracle says $p$ visited } \t\pcthen\\ + \t\pcind[2] return \text{list}\\ + \t\pcind \text{move $p$ to last in list and optionally update probabilities}} + } + \label{cat:def:oracleprob} +\end{definition} + +First, we observe that if the WF attack thinks that it is most likely an +unmonitored website, then we accept that because a WO can only teach us +something new about monitored websites. Secondly, if the most likely prediction +has been visited according to the WO then we also accept that classification +result. Finally, all that is left to do is to consider this while repeatedly +iterating over the top predictions: if the top classification is a monitored +website that has not been visited according to the WO, then move it from the top +of the list and optionally update probabilities (if applicable, then also set +$p=0.0$ before updating) and try again. Per definition, we will either hit the +case of a monitored website that has been visited according to the WO or an +unmonitored prediction. As mentioned in Section~\ref{cat:sec:back:wf}, WF output +that has some sort of probability or threshold associated with classifications +are useful for attackers with different requirements wrt. false positives and +negatives. + +One could consider a third approach based on repeatedly querying a WO to first +determine if any monitored websites have been visited and then train an +optimised classifier (discarding monitored websites that we know have not been +visited). 
While this may give a minor improvement, our results later in this +paper as well as earlier work show that confusing monitored websites is a minor +issue compared to confusing an unmonitored website as +monitored~\cite{DBLP:conf/ndss/GreschbachPRWF17,DBLP:conf/ccs/JuarezAADG14,Wang}. diff --git a/summary/src/cat/src/othersources.tex b/summary/src/cat/src/othersources.tex new file mode 100644 index 0000000..35d3c71 --- /dev/null +++ b/summary/src/cat/src/othersources.tex @@ -0,0 +1,112 @@ +\section{Sources of Website Oracles} \label{cat:app:sources} +There are a wide number of possible sources to instantiate WOs. Here we present +some details on a selection of sources, far from exhaustive. + +\subsection{Surveillance Programmes} +Intelligence agencies operate surveillance programmes that perform bulk +collection and retention of communications metadata, including +web-browsing~\cite{lyon2014surveillance}. For example, the Snowden revelations +included \emph{Marina}: +\begin{quote} + Of the more distinguishing features, Marina has the ability to look back on + the last 365 days' worth of DNI (Digital Network Intelligence) metadata seen + by the Sigint collection system, \emph{regardless} whether or not it was + tasked for collection~\cite{guardian}. +\end{quote} + +Another example is the prevalence of nation states to monitor Internet traffic +that crosses geographic borders. For example, China operates the Great Firewall +of China that is also used for censorship purposes. Due to the nature of Tor and +how exits are selected, visits to websites that are not operated by world-wide +reaching hosting providers are highly likely to cross multiple nation borders as +traffic goes from an exit to the website. It is also worth to highlight that any +sensitive website hosted from within a country where a state actor is interested +in identifying visitors are likely to capture traffic to that website due to the +Tor traffic crossing its borders more often than not. 
+ +\subsection{Content Delivery Networks} +Content Delivery Networks (CDNs), such as Akamai, Google, and Amazon host +different types of content for a significant fraction of all websites on the +Internet~\cite{ScheitleHGJZSV18}. Inherently, all requests for these resources +are easily identified as coming from Tor exits, and depending on content, things +like unique identifiers and HTTP referrer headers enable the CDN provider to +infer the website the content is hosted on. + +\subsection{Internet Giants} +Internet giants like Google, Apple, Facebook, Amazon, Microsoft, and Cloudflare +make up a large fraction the web as we know it. For example the use of Google +Analytics is wide-spread, so is hosting in clouds provided by several of these +giants, and Cloudflare with its ``cloud network platform'' hosts over 13 million +domains~\cite{cf-size}. +While some of them may do what is in their power to protect the valuable data +they process and retain, they are still subject to many legal frameworks across +the world that might not offer the best of protections for, say, access logs +pertaining to ``anonymous'' users of Tor when requested by authorities of nation +states. As another example, Cloudflare offers a nice API for their customers to +get their access logs with Unix nanosecond precision. The logs are retained for +up to seven +days~\cite{cf-retention}, +giving ample time for legal requests. + +\subsection{Access Logs of Web Servers} +The vast majority of web servers retain access logs by default. Typically, they +provide unix timestamps with seconds as the resolution (the case for Apache and +nginx). Further, the access logs may be shipped to centralised security +information and event management (SIEM) systems for analysis, with varying +retention times and rigour in storage. 
For example, it is common to +``anonymize'' logs by removing parts of the IP-addresses and then retaining them +indefinitely, as is the case for Google who removes part of IP addresses in logs +after nine +months~\cite{google-retention}. + + +\subsection{Middleboxes} +Network middleboxes that observe, analyse, and potentially retain network +traffic abound. Especially in more oppressive countries, middleboxes are often +used for censorship or dragnet surveillance, e.g., as seen with Blue Coat in +Syria~\cite{bluecoat}. + +\subsection{OCSP Responders} +Chung \emph{et~al.}~\cite{ocsp-chung} found in a recent study that 95.4\% of all +certificates support the Online Certificate Status Protocol (OCSP), which allows +a client to query the responsible CA in real-time for a certificate's revocation +status via HTTP. As such, the browsed website will be exposed to the CA in +question. From a privacy-standpoint this could be solved if the server +\emph{stapled} a recently fetched OCSP response with the served certificate. +Unfortunately, only 35\% of Alexa's top-one-million uses OCSP +stapling~\cite{ocsp-chung}. + +Unless an OCSP response is stapled while visiting a website in a default +configuration of the Tor browser, the status of a certificate is checked in +real-time using OCSP. As such, any CA that issued a certificate for a website +without OCSP stapling could instantiate a WO with an RTT-based resolution. +Similarly, any actor that observes most OCSP traffic (which is in plaintext due +to HTTP) gets the same capability. To better understand who could instantiate a +WO based on OCSP we performed preliminary traceroute measurements\footnote{% + Every RIPE Atlas probe used its configured DNS resolver(s). In total we + requested 2048 WW-probes for one-off measurements. +} on the RIPE Atlas network towards four OCSP +responders that are hosted by particularly large CAs: Let's Encrypt, Sectigo, +DigiCert, and GoDaddy. 
Let's Encrypt and Sectigo are fronted by a variety of +actors (mainly due to CDN caching), while DigiCert is fronted by a single CDN. +Requests towards GoDaddy's OCSP responder always end-up in an AS hosted by +GoDaddy. + +\subsection{Tor Exit Relays} +Anyone can run a Tor exit relay and have it be used by all Tor users. Obviously, +the operator of the exit relay can observe when its relay is used and the +destination websites. At the time of writing, the consumed exit bandwidth of the +entire Tor network is around 50~Gbit/s. This makes the necessary investment for +an attacker that wishes to get a decent chunk of exit bandwidth more a question +of stealthily deploying new exit relays than prohibitively large monetary costs. + +\subsection{Information Leaks} +More sophisticated attackers can look for information leaks at the application, +network, and operating system levels that allow them to infer that websites have +been visited. Application level information leaks are particularly of concern +for onion services: any observable state that can be tied to a new visitor is a +WO for an onion visit (this is not the case for ``regular'' websites). Such +state can include online status or the number of online users of a service, any +observable activity with timestamps, a predictable caching structure, and so on. +Similar information leaks can also occur on the network and operating system +level~\cite{DBLP:journals/ton/CaoQWDKM18,DBLP:conf/uss/EnsafiPKC10,DBLP:conf/ccs/QianMX12}. 
diff --git a/summary/src/cat/src/ref-min.bib b/summary/src/cat/src/ref-min.bib new file mode 100644 index 0000000..fe12c0e --- /dev/null +++ b/summary/src/cat/src/ref-min.bib @@ -0,0 +1,837 @@ +@misc{google-retention, + author = {Google LLC.}, + title = {How {Google} retains data we collect}, + howpublished = {\url{https://web.archive.org/web/20190227170903/https://policies.google.com/technologies/retention}}, +} + +@misc{cf-retention, + author = {Cloudflare Inc.}, + title = {{FAQs}}, + howpublished = {\url{https://web.archive.org/web/20190227165850/https://developers.cloudflare.com/logs/faq/}}, +} + +@misc{cf-size, + author = {Cloudflare Inc.}, + title = {Helping Build a Better Internet}, + howpublished = {\url{https://web.archive.org/web/20190227165133/https://www.cloudflare.com/}}, +} + +@misc{guardian, + author = {The Guardian}, + title = {{NSA} stores metadata of millions of web users for up to a year, secret files show}, + howpublished = {\url{https://www.theguardian.com/world/2013/sep/30/nsa-americans-metadata-year-documents}, accessed 2019-02-27}, +} + +@misc{wiki, + author = {Wikipedia contributors}, + title = {Softmax function---{Wikipedia}{,} the free encyclopedia.}, + howpublished = {\url{https://en.wikipedia.org/w/index.php?title=Softmax_function&oldid=883834589}, accessed 2019-02-17}, +} + +@misc{alexa, + author = {Amazon}, + title = {The top 500 sites on the web}, + howpublished = {\url{https://www.alexa.com/topsites}, accessed 2019-02-13}} +} + +@misc{tor-safety-board, + author = {Tor Project}, + title = {Tor Research Safety Board}, + howpublished = {\url{https://research.torproject.org/safetyboard.html}, accessed 2019-02-13}, +} + +@misc{google-bid-anon, + author = {Google LLC.}, + title = {Set your desktop and mobile web inventory to Anonymous, Branded, or Semi-transparent in {AdX}}, + howpublished = {\url{https://web.archive.org/web/20190228123602/https://support.google.com/admanager/answer/2913411?hl=en&ref_topic=2912022}}, +} + 
+@misc{google-bid, + author = {Google LLC.}, + title = {Real-Time Bidding Protocol Buffer v.161 }, + howpublished = {\url{https://web.archive.org/web/20190228122615/https://developers.google.com/authorized-buyers/rtb/downloads/realtime-bidding-proto}}, +} + +@misc{google-dn, + author = {Google LLC.}, + title = {About targeting for Display Network campaigns}, + howpublished = {\url{https://web.archive.org/web/20190228122431/https://support.google.com/google-ads/answer/2404191?hl=en&ref\_topic=3121944\%5C}}, +} + +@misc{google-purge, + author = {Google LLC.}, + title = {Flush Cache}, + howpublished = {\url{https://web.archive.org/web/20190228150306/https://developers.google.com/speed/public-dns/cache}}, +} + +@misc{cf-purge, + author = {Cloudflare Inc.}, + title = {Purge Cache}, + howpublished = {\url{https://web.archive.org/web/20190228150344/https://1.1.1.1/purge-cache/}}, +} + +@misc{bug-report, + author = {Tobias Pulls and Rasmus Dahlberg}, + title = {{OOM} manger wipes entire {DNS} cache}, + howpublished = {\url{https://trac.torproject.org/projects/tor/ticket/29617}}, + year = {2020}, +} + +@misc{tor-0401, + author = {Nick Mathewson}, + title = {New Release: {Tor} 0.4.0.1-alpha }, + howpublished = {\url{https://blog.torproject.org/new-release-tor-0401-alpha}, accessed 2019-02-08}, +} + +@misc{netcraft-survey, + author = {Netcraft}, + title = {January 2019 Web Server Survey}, + howpublished = {\url{https://web.archive.org/web/20190208081915/https://news.netcraft.com/archives/category/web-server-survey/}} +} + +@inproceedings{DBLP:conf/ccs/JuarezAADG14, + author = {Marc Ju{\'{a}}rez and + Sadia Afroz and + Gunes Acar and + Claudia D{\'{\i}}az and + Rachel Greenstadt}, + title = {A Critical Evaluation of Website Fingerprinting Attacks}, + booktitle = {{CCS}}, + year = {2014} +} + +@article{chow1970optimum, + title={On optimum recognition error and reject tradeoff}, + author={Chow, C}, + journal={IEEE Trans. Inf. 
Theory}, + volume={16}, + number={1}, + year={1970}, +} + +@inproceedings{stolerman2013classify, + title={Classify, but verify: Breaking the closed-world assumption in stylometric authorship attribution}, + author={Stolerman, Ariel and Overdorf, Rebekah and Afroz, Sadia and Greenstadt, Rachel}, + booktitle={IFIP Working Group}, + volume={11}, + year={2013} +} + +@inproceedings{DBLP:conf/ndss/GreschbachPRWF17, + author = {Benjamin Greschbach and + Tobias Pulls and + Laura M. Roberts and + Phillip Winter and + Nick Feamster}, + title = {The Effect of {DNS} on {Tor}'s Anonymity}, + booktitle = {{NDSS}}, + year = {2017}, +} + +@article{DBLP:journals/popets/WangG16, + author = {Tao Wang and + Ian Goldberg}, + title = {On Realistically Attacking {Tor} with Website Fingerprinting}, + journal = {PETS}, + volume = {2016}, + number = {4}, +} + +@inproceedings{DBLP:conf/uss/KwonALDD15, + author = {Albert Kwon and + Mashael AlSabah and + David Lazar and + Marc Dacier and + Srinivas Devadas}, + title = {Circuit Fingerprinting Attacks: Passive Deanonymization of {Tor} Hidden + Services}, + booktitle = {{USENIX} Security}, + year = {2015}, +} + +@inproceedings{DBLP:conf/wpes/PanchenkoMHLWE17, + author = {Andriy Panchenko and + Asya Mitseva and + Martin Henze and + Fabian Lanze and + Klaus Wehrle and + Thomas Engel}, + title = {Analysis of Fingerprinting Techniques for {Tor} Hidden Services}, + booktitle = {{WPES}}, + year = {2017}, +} + +@inproceedings{jansenccs18, + author = {Rob Jansen and Matthew Traudt and Nick Hopper}, + title = {Privacy-Preserving Dynamic Learning of {Tor} Network Traffic}, + booktitle = {{CCS}}, + year = {2018} +} + +@inproceedings{DBLP:conf/ccs/JansenJ16, + author = {Rob Jansen and + Aaron Johnson}, + title = {Safely Measuring {Tor}}, + booktitle = {{CCS}}, + year = {2016}, +} + +@inproceedings{DBLP:conf/wpes/VinesRK17, + author = {Paul Vines and + Franziska Roesner and + Tadayoshi Kohno}, + title = {Exploring {ADINT:} Using Ad Targeting for Surveillance 
on a Budget + - or - How Alice Can Buy Ads to Track Bob}, + booktitle = {{WPES}}, + year = {2017}, +} + +@article{riggingranking, + author = {Victor Le Pochat and + Tom van Goethem and + Wouter Joosen}, + title = {Rigging Research Results by Manipulating Top Websites Rankings}, + journal = {CoRR}, + volume = {abs/1806.01156}, + year = {2018}, +} + +@inproceedings{torusage, + author = {Akshaya Mani and + T. Wilson{-}Brown and + Rob Jansen and + Aaron Johnson and + Micah Sherr}, + title = {Understanding {Tor} Usage with Privacy-Preserving Measurement}, + booktitle = {{IMC}}, + year = {2018} +} + +@article{lyon2014surveillance, + title={Surveillance, {Snowden}, and big data: Capacities, consequences, critique}, + author={Lyon, David}, + journal={Big Data \& Society}, + volume={1}, + number={2}, + year={2014}, + publisher={SAGE Publications Sage UK: London, England} +} + +@inproceedings{Wang, + author = {Tao Wang and + Xiang Cai and + Rishab Nithyanand and + Rob Johnson and + Ian Goldberg}, + title = {Effective Attacks and Provable Defenses for Website Fingerprinting}, + booktitle = {{USENIX} Security}, + year = {2014}, +} + +@article{Cherubin17, + author = {Giovanni Cherubin}, + title = {Bayes, not Na{\"{\i}}ve: Security Bounds on Website Fingerprinting Defenses}, + journal = {PETS}, + volume = {2017}, + number = {4}, +} + +@inproceedings{Tamaraw, + author = {Xiang Cai and + Rishab Nithyanand and + Tao Wang and + Rob Johnson and + Ian Goldberg}, + title = {A Systematic Approach to Developing and Evaluating Website Fingerprinting + Defenses}, + booktitle = {{CCS}}, + year = {2014}, +} + +@inproceedings{csbuflo, + author = {Xiang Cai and + Rishab Nithyanand and + Rob Johnson}, + title = {{CS-BuFLO}: {A} Congestion Sensitive Website Fingerprinting Defense}, + booktitle = {{WPES}}, + year = {2014}, +} + +@inproceedings{DF, + author = {Payap Sirinam and + Mohsen Imani and + Marc Ju{\'{a}}rez and + Matthew Wright}, + title = {Deep Fingerprinting: Undermining Website 
Fingerprinting Defenses with + Deep Learning}, + booktitle = {{CCS}}, + year = {2018} +} + +@inproceedings{wtf-pad, + author = {Marc Ju{\'{a}}rez and + Mohsen Imani and + Mike Perry and + Claudia D{\'{\i}}az and + Matthew Wright}, + title = {Toward an Efficient Website Fingerprinting Defense}, + booktitle = {{ESORICS}}, + year = {2016} +} + +@inproceedings{WT, + author = {Tao Wang and + Ian Goldberg}, + title = {Walkie-Talkie: An Efficient Defense Against Passive Website Fingerprinting + Attacks}, + booktitle = {{USENIX} Security}, + year = {2017} +} + +@inproceedings{DynaFlow, + author = {David Lu and + Sanjit Bhat and + Albert Kwon and + Srinivas Devadas}, + title = {DynaFlow: An Efficient Website Fingerprinting Defense Based on Dynamically-Adjusting + Flows}, + booktitle = {{WPES}}, + year = {2018} +} + +@inproceedings{tor, + author = {Roger Dingledine and + Nick Mathewson and + Paul F. Syverson}, + title = {Tor: The Second-Generation Onion Router}, + booktitle = {{USENIX} Security}, + year = {2004} +} + +@inproceedings{trilemma, + author = {Debajyoti Das and + Sebastian Meiser and + Esfandiar Mohammadi and + Aniket Kate}, + title = {Anonymity Trilemma: Strong Anonymity, Low Bandwidth Overhead, Low + Latency - Choose Two}, + booktitle = {{IEEE} {S\&P}}, + year = {2018}, +} + +@inproceedings{SunSWRPQ02, + author = {Qixiang Sun and + Daniel R. Simon and + Yi{-}Min Wang and + Wilf Russell and + Venkata N. Padmanabhan and + Lili Qiu}, + title = {Statistical Identification of Encrypted Web Browsing Traffic}, + booktitle = {{IEEE S\&P}}, + year = {2002} +} + +@article{rtb, + author = {Jun Wang and + Weinan Zhang and + Shuai Yuan}, + title = {Display Advertising with Real-Time Bidding {(RTB)} and Behavioural + Targeting}, + journal = {Foundations and Trends in Information Retrieval}, + year = {2017} +} + +@inproceedings{ocsp-chung, + author = {Taejoong Chung and + Jay Lok and + Balakrishnan Chandrasekaran and + David R. Choffnes and + Dave Levin and + Bruce M. 
Maggs and + Alan Mislove and + John P. Rula and + Nick Sullivan and + Christo Wilson}, + title = {Is the Web Ready for {OCSP} Must-Staple?}, + booktitle = {{IMC}}, + year = {2018} +} + +@inproceedings{bluecoat, + author = {Chaabane Abdelberi and + Terence Chen and + Mathieu Cunche and + Emiliano De Cristofaro and + Arik Friedman and + Mohamed Ali K{\^{a}}afar}, + title = {Censorship in the Wild: Analyzing Internet Filtering in {Syria}}, + booktitle = {{IMC}}, + year = {2014} +} + +@inproceedings{PanchenkoNZE11, + author = {Andriy Panchenko and + Lukas Niessen and + Andreas Zinnen and + Thomas Engel}, + title = {Website fingerprinting in onion routing based anonymization networks}, + booktitle = {{WPES}}, + year = {2011} +} + +@inproceedings{Hintz02, + author = {Andrew Hintz}, + title = {Fingerprinting Websites Using Traffic Analysis}, + booktitle = {{PETS}}, + year = {2002} +} + +@inproceedings{HerrmannWF09, + author = {Dominik Herrmann and + Rolf Wendolsky and + Hannes Federrath}, + title = {Website fingerprinting: attacking popular privacy enhancing technologies + with the multinomial na{\"{\i}}ve-bayes classifier}, + booktitle = {{CCSW}}, + year = {2009}, +} + +@inproceedings{kfp, + author = {Jamie Hayes and + George Danezis}, + title = {k-fingerprinting: {A} Robust Scalable Website Fingerprinting Technique}, + booktitle = {{USENIX} Security}, + year = {2016}, +} + +@inproceedings{CaiZJJ12, + author = {Xiang Cai and + Xin Cheng Zhang and + Brijesh Joshi and + Rob Johnson}, + title = {Touching from a distance: website fingerprinting attacks and defenses}, + booktitle = {{CCS}}, + year = {2012}, +} + +@inproceedings{DBLP:conf/esorics/ShmatikovW06, + author = {Vitaly Shmatikov and + Ming{-}Hsiu Wang}, + title = {Timing Analysis in Low-Latency Mix Networks: Attacks and Defenses}, + booktitle = {{ESORICS}}, + year = {2006}, +} + +@misc{anonterm, + title={A terminology for talking about privacy by data minimization: Anonymity, unlinkability, undetectability, 
unobservability, pseudonymity, and identity management}, + author={Pfitzmann, Andreas and Hansen, Marit}, + publisher={Dresden, Germany}, + year={2010}, +} + +@inproceedings{JansenJGED18, + author = {Rob Jansen and + Marc Ju{\'{a}}rez and + Rafa Galvez and + Tariq Elahi and + Claudia D{\'{\i}}az}, + title = {Inside Job: Applying Traffic Analysis to Measure {Tor} from Within}, + booktitle = {{NDSS}}, + year = {2018} +} + +@article{GoldwasserM84, + author = {Shafi Goldwasser and + Silvio Micali}, + title = {Probabilistic Encryption}, + journal = {JCSS}, + volume = {28}, + number = {2}, + year = {1984}, +} + +@inproceedings{NaorY90, + author = {Moni Naor and + Moti Yung}, + title = {Public-key Cryptosystems Provably Secure against Chosen Ciphertext + Attacks}, + booktitle = {Proc. Annu. ACM Symp. Theory Comput.}, + year = {1990} +} + +@inproceedings{RackoffS91, + author = {Charles Rackoff and + Daniel R. Simon}, + title = {Non-Interactive Zero-Knowledge Proof of Knowledge and Chosen Ciphertext + Attack}, + booktitle = {{CRYPTO}}, + year = {1991} +} + +@inproceedings{Bleichenbacher98, + author = {Daniel Bleichenbacher}, + title = {Chosen Ciphertext Attacks Against Protocols Based on the {RSA} Encryption + Standard {PKCS} {\#}1}, + booktitle = {{CRYPTO}}, + year = {1998} +} + +@article{RonenGGSWY18, + author = {Eyal Ronen and + Robert Gillham and + Daniel Genkin and + Adi Shamir and + David Wong and + Yuval Yarom}, + title = {The 9 Lives of {Bleichenbacher's} {CAT:} New Cache ATtacks on {TLS} + Implementations}, + journal = {{IACR} Cryptology ePrint Archive}, + year = {2018}, +} + +@inproceedings{DBLP:conf/sp/DyerCRS12, + author = {Kevin P. Dyer and + Scott E. 
Coull and + Thomas Ristenpart and + Thomas Shrimpton}, + title = {Peek-a-Boo, {I} Still See You: Why Efficient Traffic Analysis Countermeasures + Fail}, + booktitle = {{IEEE} {S\&P}}, + year = {2012} +} + +@inproceedings{DBLP:conf/wpes/NithyanandCJ14, + author = {Rishab Nithyanand and + Xiang Cai and + Rob Johnson}, + title = {Glove: {A} Bespoke Website Fingerprinting Defense}, + booktitle = {{WPES}}, + year = {2014} +} + +@inproceedings{mathews2018understanding, + title={UNDERSTANDING FEATURE DISCOVERY IN WEBSITE FINGERPRINTING ATTACKS}, + author={Mathews, Nate and Sirinam, Payap and Wright, Matthew}, + booktitle={{WNYISPW}}, + year={2018}, +} + +@article{abe2016fingerprinting, + title={Fingerprinting attack on {Tor} anonymity using deep learning}, + author={Abe, Kota and Goto, Shigeki}, + journal={Proceedings of the Asia-Pacific Advanced Network}, + volume={42}, + year={2016} +} + +@inproceedings{DBLP:conf/ndss/RimmerPJGJ18, + author = {Vera Rimmer and + Davy Preuveneers and + Marc Ju{\'{a}}rez and + Tom van Goethem and + Wouter Joosen}, + title = {Automated Website Fingerprinting through Deep Learning}, + booktitle = {{NDSS}}, + year = {2018} +} + +@inproceedings{cumul, + author = {Andriy Panchenko and + Fabian Lanze and + Jan Pennekamp and + Thomas Engel and + Andreas Zinnen and + Martin Henze and + Klaus Wehrle}, + title = {Website Fingerprinting at Internet Scale}, + booktitle = {{NDSS}}, + year = {2016} +} + +@article{cheng1998traffic, + title={Traffic analysis of {SSL} encrypted web browsing}, + author={Cheng, Heyning and Avnur, Ron}, + journal={Project paper, University of Berkeley}, + year={1998} +} + +@inproceedings{DBLP:conf/sp/SunSWRPQ02, + author = {Qixiang Sun and + Daniel R. Simon and + Yi{-}Min Wang and + Wilf Russell and + Venkata N. 
Padmanabhan and + Lili Qiu}, + title = {Statistical Identification of Encrypted Web Browsing Traffic}, + booktitle = {{IEEE S\&P}}, + year = {2002} +} + +@inproceedings{DBLP:conf/ccs/LiberatoreL06, + author = {Marc Liberatore and + Brian Neil Levine}, + title = {Inferring the source of encrypted {HTTP} connections}, + booktitle = {{CCS}}, + year = {2006} +} + +@inproceedings{KedoganAP02, + author = {Dogan Kesdogan and + Dakshi Agrawal and + Stefan Penz}, + title = {Limits of Anonymity in Open Environments}, + booktitle = {{IH}}, + year = {2002} +} + +@inproceedings{DBLP:conf/pet/Danezis04, + author = {George Danezis}, + title = {The Traffic Analysis of Continuous-Time Mixes}, + booktitle = {{PETS}}, + year = {2004} +} + +@inproceedings{DBLP:conf/ih/DanezisS04, + author = {George Danezis and + Andrei Serjantov}, + title = {Statistical Disclosure or Intersection Attacks on Anonymity Systems}, + booktitle = {{IH}}, + year = {2004} +} + +@inproceedings{DBLP:conf/diau/BertholdPS00, + author = {Oliver Berthold and + Andreas Pfitzmann and + Ronny Standtke}, + title = {The Disadvantages of Free {MIX} Routes and how to Overcome Them}, + booktitle = {International Workshop on Design Issues in Anonymity and Unobservability}, + year = {2000}, +} + +@inproceedings{KesdoganP04, + author = {Dogan Kesdogan and + Lexi Pimenidis}, + title = {The Hitting Set Attack on Anonymity Protocols}, + booktitle = {{IH}}, + year = {2004} +} + +@inproceedings{TroncosoGPV08, + author = {Carmela Troncoso and + Benedikt Gierlichs and + Bart Preneel and + Ingrid Verbauwhede}, + title = {Perfect Matching Disclosure Attacks}, + booktitle = {{PETS}}, + year = {2008} +} + +@inproceedings{DiazSCP02, + author = {Claudia D{\'{\i}}az and + Stefaan Seys and + Joris Claessens and + Bart Preneel}, + title = {Towards Measuring Anonymity}, + booktitle = {{PETS}}, + year = {2002} +} + +@inproceedings{SerjantovD02, + author = {Andrei Serjantov and + George Danezis}, + title = {Towards an Information Theoretic 
Metric for Anonymity}, + booktitle = {{PETS}}, + year = {2002} +} + +@inproceedings{Raymond00, + author = {Jean{-}Fran{\c{c}}ois Raymond}, + title = {Traffic Analysis: Protocols, Attacks, Design Issues, and Open Problems}, + booktitle = {International Workshop on Design Issues in Anonymity and Unobservability}, + year = {2000} +} + +@inproceedings{Danezis03, + author = {George Danezis}, + title = {Statistical Disclosure Attacks}, + booktitle = {{IFIP SEC}}, + year = {2003} +} + +@inproceedings{MurdochD05, + author = {Steven J. Murdoch and + George Danezis}, + title = {Low-Cost Traffic Analysis of {Tor}}, + booktitle = {{IEEE} {S\&P}}, + year = {2005} +} + +@inproceedings{ChakravartySK10, + author = {Sambuddho Chakravarty and + Angelos Stavrou and + Angelos D. Keromytis}, + title = {Traffic Analysis against Low-Latency Anonymity Networks Using Available + Bandwidth Estimation}, + booktitle = {{ESORICS}}, + year = {2010} +} + +@inproceedings{MittalKJCB11, + author = {Prateek Mittal and + Ahmed Khurshid and + Joshua Juen and + Matthew Caesar and + Nikita Borisov}, + title = {Stealthy traffic analysis of low-latency anonymous communication using + throughput fingerprinting}, + booktitle = {{CCS}}, + year = {2011} +} + +@inproceedings{deepcorr, + author = {Milad Nasr and + Alireza Bahramali and + Amir Houmansadr}, + title = {DeepCorr: Strong Flow Correlation Attacks on {Tor} Using Deep Learning}, + booktitle = {{CCS}}, + year = {2018} +} + +@inproceedings{JohnsonWJSS13, + author = {Aaron Johnson and + Chris Wacek and + Rob Jansen and + Micah Sherr and + Paul F. 
Syverson}, + title = {Users get routed: traffic correlation on {Tor} by realistic adversaries}, + booktitle = {{CCS}}, + year = {2013} +} + +@inproceedings{BorisovDMT07, + author = {Nikita Borisov and + George Danezis and + Prateek Mittal and + Parisa Tabriz}, + title = {Denial of service or denial of security?}, + booktitle = {{CCS}}, + year = {2007} +} + +@inproceedings{SunEVLRCM15, + author = {Yixin Sun and + Anne Edmundson and + Laurent Vanbever and + Oscar Li and + Jennifer Rexford and + Mung Chiang and + Prateek Mittal}, + title = {{RAPTOR:} Routing Attacks on Privacy in {Tor}}, + booktitle = {{USENIX} Security}, + year = {2015} +} + +@inproceedings{ScheitleHGJZSV18, + author = {Quirin Scheitle and + Oliver Hohlfeld and + Julien Gamba and + Jonas Jelten and + Torsten Zimmermann and + Stephen D. Strowes and + Narseo Vallina{-}Rodriguez}, + title = {A Long Way to the Top: Significance, Structure, and Stability of Internet + Top Lists}, + booktitle = {{IMC}}, + year = {2018} +} + +@article{DBLP:journals/ton/CaoQWDKM18, + author = {Yue Cao and + Zhiyun Qian and + Zhongjie Wang and + Tuan Dao and + Srikanth V. Krishnamurthy and + Lisa M. Marvel}, + title = {Off-Path {TCP} Exploits of the Challenge {ACK} Global Rate Limit}, + journal = {{IEEE/ACM} Trans. Netw.}, + volume = {26}, + number = {2}, + year = {2018} +} + +@inproceedings{DBLP:conf/ccs/QianMX12, + author = {Zhiyun Qian and + Zhuoqing Morley Mao and + Yinglian Xie}, + title = {Collaborative {TCP} sequence number inference attack: how to crack + sequence number under a second}, + booktitle = {{CCS}}, + year = {2012} +} + +@inproceedings{DBLP:conf/uss/EnsafiPKC10, + author = {Roya Ensafi and + Jong Chun Park and + Deepak Kapur and + Jedidiah R. 
Crandall}, + title = {Idle Port Scanning and Non-interference Analysis of Network Protocol + Stacks Using Model Checking}, + booktitle = {{USENIX} Security}, + year = {2010} +} + +@misc{onionv2, + author = {{Tor Project}}, + title = {{Tor} Rendezvous Specification - Version 2}, + howpublished = {\url{https://gitweb.torproject.org/torspec.git/tree/rend-spec-v2.txt}, accessed 2019-02-13}, +} + +@misc{onionv3, + author = {{Tor Project}}, + title = {{Tor} Rendezvous Specification - Version 3}, + howpublished = {\url{https://gitweb.torproject.org/torspec.git/tree/rend-spec-v3.txt}, accessed 2019-02-13}, +} + +@inproceedings{Merget19, + author = {Robert Merget and Juraj Somorovsky and Nimrod Aviram and Craig Young and Janis Fliegenschmidt and Jörg Schwenk and Yuval Shavitt}, + title = {Scalable Scanning and Automatic Classification of {TLS} Padding Oracle Vulnerabilities}, + booktitle = {{USENIX} Security}, + year = {2019}, + note = {to appear} +} + +@inproceedings{DBLP:conf/pet/WinterKMHSLW14, + author = {Philipp Winter and + Richard K{\"{o}}wer and + Martin Mulazzani and + Markus Huber and + Sebastian Schrittwieser and + Stefan Lindskog and + Edgar R. Weippl}, + title = {Spoiled Onions: Exposing Malicious {Tor} Exit Relays}, + booktitle = {{PETS}}, + year = {2014} +} + +@phdthesis{Wang2015a, + author = {Tao Wang}, + title = {Website Fingerprinting: Attacks and Defenses}, + school = {University of Waterloo}, + year = {2015}, + howpublished = {\url{https://nymity.ch/tor-dns/pdf/Wang2015a.pdf}}, +} + +@misc{perryCrit, + author = {Mike Perry}, + title = {A Critique of Website Traffic Fingerprinting Attacks}, + howpublished = {\url{https://blog.torproject.org/critique-website-traffic-fingerprinting-attacks}, accessed 2019-02-08}, +} + +@article{DBLP:journals/jsac/ReedSG98, + author = {Michael G. Reed and Paul F. Syverson and David M. 
Goldschlag}, + title = {Anonymous connections and onion routing}, + journal = {{JSAC}}, + volume = {16}, + number = {4}, + year = {1998}, +} diff --git a/summary/src/cat/src/related.tex b/summary/src/cat/src/related.tex new file mode 100644 index 0000000..6c36654 --- /dev/null +++ b/summary/src/cat/src/related.tex @@ -0,0 +1,64 @@ +\section{Related Work} \label{cat:sec:related} +The combination of a WF attack with a WO is a type of Classify-Verify method as +proposed by Stolerman et al.~\cite{stolerman2013classify}, which in turn is a +type of rejection function as described by Chow~\cite{chow1970optimum}. Such a +method was first used in the context of WF by Juarez +\emph{et~al.}~\cite{DBLP:conf/ccs/JuarezAADG14} and later by Greschbach +\emph{et~al.} \cite{DBLP:conf/ndss/GreschbachPRWF17} to augment WF attacks with +inferences from observed DNS traffic. Note that the attack by Greschbach et al. +can be seen as a probabilistic WO due to the attacker under their threat model +only observing a fraction of DNS traffic from the Tor network. Our work builds +upon and generalises their work where DNS traffic is just one of many possible +sources to infer website visits from. Further, our DNS-based sources are usable +by anyone instead of relatively strong network attackers (or Google or +Cloudflare). + +All anonymity networks produce anonymity sets (per definition) that change with +observations by an attacker over time~\cite{Raymond00}. Modelling the behaviour +of an anonymity system (as a mix), what the attacker observes, and how the +anonymity sets change over time allows us to reason about how the attacker can +perform traffic analysis and break the anonymity provided by the +system~\cite{DiazSCP02,KedoganAP02,SerjantovD02}. 
Attacks along these lines are +many with more-or-less consistent terminology, including intersection attacks, +(statistical) disclosure attacks, and traffic confirmation +attacks~\cite{DBLP:conf/diau/BertholdPS00,Danezis03, +DBLP:conf/pet/Danezis04,DBLP:conf/ih/DanezisS04,KesdoganP04,Raymond00, +DBLP:journals/jsac/ReedSG98,TroncosoGPV08}. + +WOs are nothing more than applying the notion of anonymity sets to the potential +destination websites visited over an anonymity network like Tor and giving an +attacker the ability to query this anonymity set for membership for a limited +number of monitored websites. The way we use WOs in our generic attacks is +\emph{not to learn long-term statistically unlikely relationships} between +senders and recipients in a network. Rather, the WO is only used to learn +\emph{part of the anonymity set at the time of the attack}. That an attacker can +observe anonymity sets is not novel, what is novel in our work is how we apply +it to the WF domain and argue for its inclusion as a core attacker capability +when modelling WF attacks and defenses. + +Murdoch and Danezis showed how to use observed latency in Tor as an oracle to +perform traffic analysis attacks \cite{MurdochD05}. Chakravarty \emph{et~al.} +detailed similar attacks but based on bandwidth estimation +\cite{ChakravartySK10} and Mittal \emph{et~al.} using throughput +estimation~\cite{MittalKJCB11}. Attackers in these cases do not need to be +directly in control of significant fractions Tor, but rather use network +measurements to infer the state of the network and create an oracle that an +attacker can utilize, similar to WOs. + +Correlation of input and output flows is at the core of many attacks on +anonymity networks like Tor~\cite{BorisovDMT07,JohnsonWJSS13,SunEVLRCM15}. Flow +correlation attacks correlate traffic on the network layer, considering packet +sizes and timing of sent traffic. 
The RAPTOR attack by Sun et +al.~\cite{SunEVLRCM15} needs about 100MB of data sent over five minutes to +correlate flows with high accuracy. The recent state-of-the-art attack DeepCorr +by Nasr \emph{et~al.} \cite{deepcorr}---based on deep learning like Deep +Fingerprinting by Sirinam \emph{et~al.}~\cite{DF}---needs only about 900KB of +data (900 packets) for comparable accuracy to RAPTOR. While flow correlation +attacks like RAPTOR and DeepCorr operate on the network layer, WF+WO attacks can +be viewed as \emph{application layer} correlation attacks. WF attacks extract +the application-layer data (the website) while WOs reconstruct parts of the +anonymity set of possible monitored websites visited. WF attacks need to observe +most of the traffic generated when visiting a website that goes into the +anonymity network. While a WO does not have to directly view any of the output +flows of the network, it needs to be able to infer if a particular website was +visited during a period of time, as shown in Section~\ref{cat:sec:sources}. diff --git a/summary/src/cat/src/sim.tex b/summary/src/cat/src/sim.tex new file mode 100644 index 0000000..4077b89 --- /dev/null +++ b/summary/src/cat/src/sim.tex @@ -0,0 +1,131 @@ +\section{Simulating Website Oracles} \label{cat:sec:sim} +To be able to \emph{simulate} access to a WO for \emph{arbitrary monitored +websites} we need to simulate the entire website anonymity set of Tor, because +the anonymity set is what a WO queries for membership. We opt for simulation for +ethical reasons. The simulation has three key parts: how those visits are +distributed, the number of visits to websites over Tor, and the timeframe +(resolution) of the oracle source. Note that the first two parts are easy for an +attacker to estimate by simply observing traffic from live Tor exit relays, +something we cannot trivially do as researchers adhering to Tor's research +safety +guidelines~\cite{tor-safety-board}. 
+Another option available to an attacker is to repeatedly query a WO to learn +about the popularity of its monitored websites and based on those figures infer +the utility of the WO. We opted to not perform such measurements ourselves, +despite access to several WOs, due to fears of inadvertently harming Tor users. +Instead we base our simulations on results from the privacy-preserving +measurements of the Tor network in early 2018 by Mani +\emph{et~al.}~\cite{torusage}. + +\subsection{How Website Visits are Distributed} +\label{cat:sec:sim:dist} +Table~\ref{cat:table:visits} shows the average inferred website popularity from Mani +\emph{et~al.}~\cite{torusage}. The average percentage does not add up to 100\%, +presumably due to the privacy-preserving measurement technique or rounding +errors. Their results show that \texttt{torproject.org} is very popular (perhaps +due to a bug in software using Tor), and beyond that focus on +Alexa's~\cite{alexa} top one million most +popular websites as bins. The ``other'' category is for websites identified not +part of Alexa's top one million websites ranking. For the rest of the analysis +(not simulation) in this paper we \emph{exclude} \texttt{torproject.org}: for one, +that Tor users visit that website is unlikely to be an interesting fact for an +attacker to monitor, and its over-representation (perhaps due to a bug) will +skew our analysis. Excluding \texttt{torproject.org}, about one third of all +website visits go to Alexa (0,1k], one third to Alexa (1k,1m], and one third to +other websites. The third column of Table~\ref{cat:table:visits} contains adjusted +average percentages. 
+ +\begin{table}[!t] +\caption{Inferred average website popularity for the entire Tor network early 2018, from Mani \emph{et~al.}~\cite[Figure 2]{torusage}.} +\centering +\begin{tabular}{lcc} +Website & Average & Without\\ + & primary domain (\%) & torproject.org\\ +\midrule +torproject.org & 40.1 & \\ +Alexa (0,10] & 8.4 & 13.9 \\ +Alexa (10,100] & 5.1 & 8.4 \\ +Alexa (100,1k] & 6.2 & 10.3 \\ +Alexa (1k,10k] & 4.3 & 7.1 \\ +Alexa (10k,100k] & 7.7 & 12.7\\ +Alexa (100k,1m] & 7.0 & 11.6 \\ +other & 21.7 & 35.9 \\ +\end{tabular} +\label{cat:table:visits} +\end{table} + +In our simulations for website visits we treat the entries in column two of +Table~\ref{cat:table:visits} as bins of a histogram with the relative size indicated +by the average website popularity. After randomly selecting a bin (weighted by +popularity), in the case of an Alexa range we uniformly select a website within +the range, and for the other category we uniformly select from one million other +websites. This is a conservative choice given that there are hundreds of +millions of active websites on the Internet. Uniformly selecting within a bin +will make the more popular websites in the bin likely underrepresented while +less popular websites in the bin get overrepresented. However, we typically +simulate an attacker that monitors $\approx$100 websites and use the website +popularity as the starting rank of the first monitored website. For the most +popular websites, monitoring 100 websites covers the entire or significant +portions of the bins (Alexa $\leq$1k), and for less popular websites (Alexa +$>$1k), as our results later show, this does not matter. + +\subsection{The Number of Website Visits} +Mani \emph{et~al.} also inferred with a 95\% confidence interval that +$(104\pm36)*10^6$ \emph{initial} streams are created during a 24 hour period in +the entire Tor network \cite{torusage}. 
Based on this, in our simulation we +assume 140 million website visits per day that are distributed as described +above and occur uniformly throughout the day. While assuming uniformity is +naive, we selected the upper limit of the confidence interval to somewhat negate +any unreasonable advantage to the attacker. + +\subsection{A Reasonable Timeframe} +\label{cat:sec:sim:timeframe} +Wang and Goldberg show that it is realistic to assume that an attacker can +determine the start of a webpage load even in the presence of background noise +and multiple concurrent website visits~\cite{DBLP:journals/popets/WangG16}. An +attacker can further determine if a circuit is used for onion services or +not~\cite{DBLP:conf/uss/KwonALDD15,DBLP:conf/wpes/PanchenkoMHLWE17}. Now, +consider an attacker that observes traffic between a Tor client and its guard. +The initial stream contains the first HTTP GET request for a typical website +visit. The request will be the first outgoing packet as part of a website visit +once a connection has been established. When the request arrives at the +destination is the point in time when an oracle, e.g., instantiated by access +logs would record this time as the time of visit. Clearly, the exact time is +between the request and the response packets and the attacker observes the +timing of those packets. So what is a realistic timeframe for the attacker to +use when it queries a WO? + +Between January 22--30 (2019) we performed Round-Trip Time (RTT) measurements +using four Amazon EC2 instances that ran \emph{their own} nginx HTTP(S) servers +to visit \emph{themselves} over Tor (with \texttt{torify curl}) using a fresh +circuit for each visit. This allowed us easy access to start and stop times for +the RTT measurement, as well as the time a request appeared in the nginx access +log (without any clock-drift). In total we collected 21,497 HTTP traces and +21,492 HTTPS traces, where each trace contains start, log, and stop timestamps. 
+Our results are shown in Figure~\ref{cat:fig:aws}. It is clear that that log-to-stop +times are independent of HTTP(S). More than half of all log-to-stop times +($54.5$\%) are within a 100~ms window (see 40--140~ms), and nearly all +log-to-stop times are less than 1000~ms. + +\begin{figure}[!t] + \centering + \includegraphics[width=0.7\textwidth]{src/cat/img/aws} + \caption{% + Time differences between start, log, and stop events when visiting a website over HTTP(S) using Tor. + } + \label{cat:fig:aws} +\end{figure} + +Based on our experiment results we consider three timeframes relevant: 10 ms, +100 ms, and 1000 ms. First, 10 ms is relevant as close to optimal for any +attacker. On average, there are only 17 website visits during a 10 ms window in +the entire Tor network. 100 ms is our default for the WF experiments we perform: +we consider it realistic for many sources of WOs (e.g., Cloudflare logs and +real-time bidding). We also consider a 1000 ms timeframe relevant due to the +prevalence of sources of WOs with a resolution in seconds (e.g., due to Unix +timestamps or TTLs for DNS). Based on our simulations and the different +timeframes, Appendix~\ref{cat:app:bayes} contains an analysis of the utility of WOs +using Bayes' law. Appendix~\ref{cat:app:lessons} presents some key lessons from the +simulation, in particular that while the resolution and resulting timeframe is +an important metric in our simulation, it is minor in comparison to the overall +website popularity in Tor of the monitored websites. diff --git a/summary/src/cat/src/sources.tex b/summary/src/cat/src/sources.tex new file mode 100644 index 0000000..89616ad --- /dev/null +++ b/summary/src/cat/src/sources.tex @@ -0,0 +1,204 @@ +\section{Sources of Website Oracles} \label{cat:sec:sources} +There are a wide range of potential sources of WOs. Table~\ref{cat:table:sources} +summarizes a selection of sources that are more thoroughly detailed in +Appendix~\ref{cat:app:sources}. 
The table shows the availability of the source, +i.e., if the attacker needs to query the source in near real-time as a website +visit occurs or if it can be accessed retroactively, e.g., through a legal +request. We also estimate qualitatively the false positive rate of the source, +its coverage of websites it can monitor (or fraction of Tor network traffic, +depending on source), as well as the estimated effort to access the source. +Finally, the table gives an example of an actor with access to the source. + +Next we focus on a number of sources of WOs that we find particularly relevant: +several due to DNS in Section~\ref{cat:sec:sources:dns}, the DHT of Tor onion +directory services in Section~\ref{cat:sec:sources:dht}, and real-time bidding +platforms in Section~\ref{cat:sec:sources:rtb}. + +\begin{sidewaystable} + \caption{Comparison of a number of WO sources based on their \emph{estimated} time of availability (when attacker likely has to collect data, i.e., retroactively or real-time), False Positive Rate (FPR), coverage of website/network visits, and primary entities with access.} + \centering + \label{cat:table:sources} + \begin{tabular}{l c c c c r} + Source & Availability & FPR & Coverage & Effort & Access \\ \midrule + Dragnet surveillance programmes & retroactive & negl. & high & high & intelligence agencies \\ + Content Delivery Networks & retroactive & negl. & high & high & operators\\ + Real-time bidding & real-time (retroactive) & negl. & high & modest & customers (operator)\\ + Webserver access logs & retroactive & negl. & high & medium & operators\\ + Middleboxes & retroactive~\cite{bluecoat} & negl. 
& medium & medium & operators \\ + OCSP & retroactive & low & high & medium & few CAs, plaintext\\ + 8.8.8.8 operator & retroactive & low~\cite{DBLP:conf/ndss/GreschbachPRWF17} & 16.8\% of visits & high & Google, plaintext \\ + 1.1.1.1 operator & retroactive & low~\cite{DBLP:conf/ndss/GreschbachPRWF17} & 7.4\% of visits & high & Cloudflare, plaintext \\ + Exit relays & real-time & negl. & low & low & operators \\ + Exit relays DNS cache & real-time & medium & high & medium & anyone\\ + Query DNS resolvers & real-time & high & low & low & anyone \\ + Onion v2 (v3) & real-time & negl. & high (low) & low (high) & anyone \\ + % & & & & & \\ + \end{tabular} +\end{sidewaystable} + +\subsection{DNS} +\label{cat:sec:sources:dns} +Before a website visit the corresponding domain name must be resolved to an IP +address. For a user that uses Tor browser, the exit relay of the +current circuit resolves the domain name. If the DNS record of the domain +name is already cached in the DNS cache of the exit relay, then the exit relay +uses that record. Otherwise the domain name is resolved and subsequently cached +using whichever DNS resolution mechanism that the exit relay has configured. +Based on this process we present three sources of WOs that work for unpopular +websites. + +\subsubsection{Shared Pending DNS Resolutions} +If an exit relay is asked to resolve a domain name that is uncached it will +create a list of pending connections waiting for the domain resolution to +finish. If another connection asks that the same domain name be resolved, it is +added to the list of pending connections. When a result is available all +pending connections are informed. This is the basis of a WO: if a request to +resolve a domain name returns a record \emph{more quickly than previously +measured by the attacker for uncached entries}, the entry was either pending +resolution at the time of the request or already cached. Notably this works +regardless of if exit relays have DNS caches or not. 
However, the timing +constraints of shared pending connections are significant and thus a practical +hurdle to overcome. + +\subsubsection{Tor's DNS Cache at Exit Relays} +If an unpopular website is visited by a user, the resolved domain name will +likely be cached by a \emph{single} relay. We +performed 411 \texttt{exitmap}~\cite{DBLP:conf/pet/WinterKMHSLW14} +measurements between April 1--10 (2019), collecting on average 3544 +(un)cached data points for each exit using a domain under our control +that is not used by anyone else. + +Given a labelled data set of (un)cached times for each exit relay, we can +construct distinct \emph{per-relay} classifiers that predict whether a measured +time corresponds to an (un)cached domain name. While there are many different +approaches that could be used to build such a classifier, we decided to use a +simple heuristic that should result in little or no false positives: output +`cached' iff no uncached query has \emph{ever} been this fast before. +Figure~\ref{cat:fig:dns:classifier-idea} shows the idea of this classifier in +greater detail, namely create a \emph{threshold} that is the minimum of the +largest cached time and the smallest uncached time and then say cached iff the +measured time is smaller than the threshold. Regardless of how well this +heuristic performs (see below), it should be possible to construct other +classifiers that exploit the trend of smaller resolve times and less standard +deviation for cached queries (Figure~\ref{cat:fig:dns:dist}). For example, 69.1\% of +all exit relays take at least 50~ms more time to resolve an uncached domain on +average. 
+ +\begin{figure}[!t] + \centering + \includegraphics[width=.7\columnwidth]{src/cat/img/dns__classifier-idea} + \caption{The two cases when deciding on a classifier's threshold.} + \label{cat:fig:dns:classifier-idea} +\end{figure} + +\begin{figure}[!t] + \centering + \includegraphics[width=.7\columnwidth]{src/cat/img/dns__timing-dist} + \caption{% + The difference between (un)cached standard deviation and mean times + without any absolute values, i.e., a negative value implies + that the uncached time is smaller than the cached time. + } + \label{cat:fig:dns:dist} +\end{figure} + + +To estimate an \emph{upper bound} on how effective the composite classifier of +all per-relay classifiers could be \emph{without any false positives} using +our heuristic, we applied ten-fold cross-validation to simply exclude every +exit relay that had false positives during any fold and then weighted the +observed bandwidth for the remaining classifiers by the individual true positive +rates. This gives us an +estimate of how much bandwidth we could predict true positives for without +having any false positives. By comparing it to the total exit bandwidth of the +Tor network, we obtain an estimated upper bound true positive rate for the +composite classifier of $17.3$\%. + +When an attacker measures if a domain is cached or not the domain will, after +the measurement, be cached for up to an hour (current highest caching duration +in Tor, independent of TTL) at every exit. However, if an attacker can cause +an exit to run low on memory, the entire DNS cache will be removed (instead of +only parts of it) due to a bug in the out-of-memory manager of Tor. We have +reported this to the Tor +Project~\cite{bug-report}. +We further discuss in Section~\ref{cat:sec:disc} how frequently an attacker on +average can be expected to query a WO. 
+ +\subsubsection{Caching at Recursive DNS Resolvers} +For a website that is unpopular enough, there is a high chance that nobody on +the web visited the website within a given timeframe. This is the basis of +our next idea for a WO which is \emph{not mutually exclusive} to the Tor +network: wait a couple of seconds after observing a connection, then probe all +recursive DNS resolvers of Tor exits that can be accessed to determine whether +any monitored website was cached approximately at the time of observing the +connection by inspecting TTLs. %the TTL of returned DNS records. + +In 2016 Greschbach~\emph{et~al.}~\cite{DBLP:conf/ndss/GreschbachPRWF17} showed +that remote DNS resolvers like Google's \texttt{8.8.8.8} receive a large +portion of all DNS traffic that exit relays generate. To better understand how +the DNS resolver landscape looks today, we repeated their RIPE Atlas experiment +setup for 35 hours in February 17--18 (2019), measuring every 30 minutes. Our +results show that Google (16.8\%) and Cloudflare (7.4\%) are both popular. Many +exits use a same-AS resolver which is presumably the ISP (42.3\%), while other +exits resolve themselves (15.2\%) or use a remote DNS resolver that we did not +identify (18.2\%). Further, we note that there is at least one RIPE Atlas +network measurement probe in the same AS as 53.3\% of all exits, providing +access to many of the same DNS resolvers as used by exits from a similar network +vantage point. + +Instead of using RIPE Atlas nodes we opted for a different approach which is +\emph{strictly worse}: query Google's and Cloudflare's DNS resolvers from VMs in +16 Amazon EC2 regions. With a simple experiment of first visiting a unique +domain (once again under our control and only used by us) using \texttt{torify +curl} and then querying the DNS resolvers from each Amazon VM to observe TTLs, +we got true positive rates of 2.9\% and 0.9\% for Google and Cloudflare with +1000 repetitions. 
While this may seem low, the cost for an attacker is at the +time of writing about 2 USD per day using on-demand pricing. Using an identical +setup we were also able to find a subset of monitored websites that yield +alarmingly high true positive rates: 61.4\% (Google) and 8.0\% (Cloudflare). +Presumably this was due to the cached entries being shared over a wider +geographical area for some reason (however, not globally). Regardless, coupled +with the fact that anyone can \emph{globally purge} the DNS caches of +Google~\cite{google-purge} and Cloudflare~\cite{cf-purge} for arbitrary domain +names, this is a noteworthy WO source. + +\subsection{Onion Service Directories in Tor} +\label{cat:sec:sources:dht} +To access an onion service a user first obtains the service's \emph{descriptor} +from a Distributed Hash Table (DHT) maintained by \emph{onion service +directories}. From the descriptor the user learns of \emph{introduction points} +selected by the host of the onion service in the Tor network that are used to +establish a connection to the onion service in a couple of more +steps~\cite{onionv2,onionv3} that are irrelevant here. Observing a request for +the descriptor of a monitored onion service is a source for a WO. To observe +visits for a target (known) onion service in the DHT, a relay first has to be +selected as one out of six or eight (depending on version) relays to host the +descriptor in the DHT, and then the victim has to select that relay to retrieve +the descriptor. For v2 of onion services, the location in the DHT is +deterministic~\cite{onionv2} and an attacker can position its relays in such a +way to always be selected for hosting target descriptors. Version 3 of onion +services addresses this issue by randomising the process every 24 +hours~\cite{onionv3}, forcing an attacker to host a significant number of relays +to get a WO for onion services with high coverage. 
At the time of writing, there +are about 3,500 relays operating as onion service directories. + +\subsection{Real-Time Bidding} +\label{cat:sec:sources:rtb} +Real-Time Bidding (RTB) is an approach towards online advertisement that allows +a publisher to auction ad space to advertisers on a \emph{per-visit} basis in +real time~\cite{rtb}. Google's Display Network includes more than two million +websites that reach 90\% of all Internet users~\cite{google-dn}, and an +advertiser that uses RTB must respond to submitted bid +requests containing information such as the three first network bytes of an IPv4 +address, the second-level domain name of the visited website, and the user agent +string within $\approx$100~ms~\cite{google-bid}. While the exact information +available to the bidder depends on the ad platform and the publisher's +advertisement settings, anonymous modes provide less +revenue~\cite{google-bid-anon}. +Combined with many flavours of pre-targeting such as IP and location +filtering~\cite{DBLP:conf/wpes/VinesRK17}, it is likely that the bidder knows +whether a user used Tor while accessing a monitored website. Vines et +al.~\cite{DBLP:conf/wpes/VinesRK17} further note that ``35\% of the DSPs also +allow arbitrary IP white-and blacklisting (Admedo, AdWords, Bing, BluAgile, +Criteo, Centro, Choozle, Go2Mobi, Simpli.fi)''. Finally, observe that an +attacker need not win a bid to use RTB as a WO. diff --git a/summary/src/cat/src/wf.tex b/summary/src/cat/src/wf.tex new file mode 100644 index 0000000..6b2dedb --- /dev/null +++ b/summary/src/cat/src/wf.tex @@ -0,0 +1,181 @@ +\section{Deep Fingerprinting with Website Oracles} \label{cat:sec:wf} +We first describe how we augment the Deep Fingerprinting (DF) attack by Sirinam +\emph{et~al.}~\cite{DF} with WO access. Next we evaluate the augmented +classifier on three different datasets with five different WF defenses. 
Source +code and datasets for simulating WF+WO attacks as well as steps to reproduce all +of the following results using DF are available at +\href{https://github.com/pylls/wfwo}{https://github.com/pylls/wfwo}. + +\subsection{The Augmented Classifier} +\label{cat:sec:wf:aug} +As covered in the background (Section~\ref{cat:sec:back:wf}), DF is a CNN where the +last layer is a softmax. The output is an array of probabilities for each +possible class. Compared to the implementation of DF used by Sirinam +\emph{et~al.}, we changed DF to not first use binary classification in the open +world to determine if it is an unmonitored trace or not, but rather such that +there is one class for each monitored website and one for unmonitored. +Conceptually, this slightly lowers the performance of DF in our analysis, but +our metrics show that mistaking one monitored website for another is +insignificant for the datasets used in the analysis of this paper. The principal +source of false positives is mistaking an unmonitored website for a monitored. + +Given the probability of each possible class as output of DF, we used the second +generic construction (Definition~\ref{cat:def:oracleprob}) from +Section~\ref{cat:sec:oracles:generic} to combine DF with a WO. To update the +remaining probabilities after removing a (monitored) prediction with the help of +the WO, we use a softmax again. However, due to how the softmax function is +defined, it emphasizes differences in values above one and actually +de-emphasizes values between zero and +one~\cite{wiki}. +This is problematic for us because all values we send through the softmax are +probabilities that per definition are between zero and one. To account for this, +we first divide each probability with the maximum probability and multiply with +a constant before performing the softmax. Through trial-and-error, a constant of +five gave us a reasonable threshold in probabilities. 
Note that this does not in +any way affect the order of likely classes from DF, it simply puts the +probabilities in a span that makes it easier for us to retain a threshold value +between zero and one after multiple calls to the softmax function. + +\subsection{WTF-PAD and Walkie-Talkie} +We use the original dataset of Sirinam \emph{et~al.}~\cite{DF} that consists of +95 monitored websites with 1,000 instances each as well as 20,000 unmonitored +websites (95x1k+20k). The dataset is split 8:1:1 for training, validation, and +testing, respectively. Given the dataset and our changes to DF to not do binary +classification means that our testing dataset is unbalanced in terms of +instances per class. Therefore we show precision-recall curves generated by +alternating the threshold for DF with and without WO access. + +Figure~\ref{cat:fig:df} shows the results of DF and DF+WO with a simulated WO on +Sirinam \emph{et~al.}'s dataset with no defense (Figure~\ref{cat:fig:df:nodef}), +Walkie-Talkie (Figure~\ref{cat:fig:df:wt}), and WTF-PAD +(Figure~\ref{cat:fig:df:wtfpad}). For the WO we use a 100 ms timeframe and plot the +results for different starting Alexa ranks of the 95 monitored websites. +Regardless of defense or not, we observe that for Alexa ranks 1k and less +popular websites the precision is perfect (1.0) regardless of threshold. This +indicates that---for an attacker monitoring frontpages of websites---a 100 ms WO +significantly reduces false positives for two-thirds of all website visits made +over Tor, for the vast majority of potentially monitored frontpages of websites. +Recall is also slightly improved. 
+ +\begin{figure}[!t] + \centering + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/df_nodef} + \caption{No defense.} + \label{cat:fig:df:nodef} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/df_wt} + \caption{Walkie-Talkie~\cite{WT}.} + \label{cat:fig:df:wt} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/df_wtfpad} + \caption{WTF-PAD~\cite{wtf-pad}.} + \label{cat:fig:df:wtfpad} + \end{subfigure} + \caption{Attack simulation for Deep Fingerprinting (DF) with website oracles (100 ms timeframe) on Sirinam \emph{et~al.}'s dataset~\cite{DF}. The lines in each sub-figure show DF with and without website oracle access for different starting Alexa ranks for monitored websites.} + \label{cat:fig:df} +\end{figure} + +For Walkie-Talkie we observe a significant improvement in precision due to WO +access. Wang and Goldberg note that the use of popular websites as decoy +(non-sensitive) websites protects less-popular sensitive websites due to the +base rate: an attacker claiming that the user visited the less-popular website +is (per definition) likely wrong, given that the attacker is able to detect both +potential website visits~\cite{WT}. Access to a WO flips this observation on its +head: if a WO detects the sensitive less-popular website, the base rate works in +reverse. The probability of an unpopular website being both misclassified and +visited in the timeframe is small for all but the most popular websites. The key +question becomes one of belief in the base rate of the network and that of the +target user, as analysed in Appendix~\ref{cat:app:bayes}. + +Further, WO access improves both recall and precision for all monitored websites +against WTF-PAD. WTF-PAD only provides a three percentage points decrease in +recall compared to no defense for monitored websites with Alexa ranks 1k and +above. 
+ +\subsection{CS-BuFLO and Tamaraw} +To evaluate the constant-rate defenses CS-BuFLO and Tamaraw by Cai et +al.~\cite{csbuflo,Tamaraw} we use Wang \emph{et~al.}'s dataset in the open world +\cite{Wang}. The dataset consists of 100 monitored websites with 90 instances +each and 9000 unmonitored sites (100x90+9k), that we randomly split (stratified) +into 8:1:1 for training, validation, and testing. We had to increase the length +of the input to DF for this dataset, from 5000 to 25000, to ensure that we +capture most of the dataset. To get defended traces for CS-BuFLO and Tamaraw we +use the slightly modified implementations as part of Cherubin's +framework~\cite{Cherubin17}. + +Figure~\ref{cat:fig:wang} shows the results of our simulations. DF alone is also +highly effective against the original Wang dataset---as expected---and our +attack simulation shows that we can further improve it with access to website +oracles. Most importantly, both CS-BuFLO and Tamaraw offer protection against DF +with and without oracle access by \emph{significantly lowering recall}. Tamaraw +offers an order of magnitude better defense in terms of recall. As implemented +in the framework by Cherubin, CS-BuFLO and Tamaraw reportedly have BOH 67.2\% and +256.7\%, and TOH 575.6\% and 341.4\%, respectively. This kind of overhead is +likely prohibitively large for real-world deployment in +Tor~\cite{csbuflo,Tamaraw,wtf-pad,DF,WT}. 
+ +\begin{figure}[!t] + \centering + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/wang_nodef} + \caption{No defense.\\\,} + \label{cat:fig:wang:nodef} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/wang_csbuflo} + \caption{CS-BuFLO~\cite{csbuflo}, with reported 67.2\% BOH and 575.6\% TOH~\cite{Cherubin17}.} + \label{cat:fig:wang:csbuflo} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/wang_tamaraw} + \caption{Tamaraw~\cite{Tamaraw}, with reported 256.7\% BOH and 341.4\% TOH~\cite{Cherubin17}.} + \label{cat:fig:wang:tamaraw} + \end{subfigure} + \caption{Attack simulation for Deep Fingerprinting (DF)~\cite{DF} with website oracles (100 ms timeframe) on Wang \emph{et~al.}'s dataset~\cite{Wang}. The lines in each sub-figure show DF with and without website oracle access for different starting Alexa ranks for monitored websites.} + \label{cat:fig:wang} +\end{figure} + + +\subsection{DynaFlow} +DynaFlow is a \emph{dynamic} constant-rate defense by Lu +\emph{et~al.}~\cite{DynaFlow} with two configurations that result in different +overheads and levels of protection. Lu \emph{et~al.} gathered their own dataset +of 100 monitored websites with 90 instances each and 9000 unmonitored websites +(100x90+9k, same as Wang \emph{et~al.}'s \cite{Wang}) to be able to combine +smaller packets, as discussed briefly in Section~\ref{cat:sec:back:wf}. As for +CS-BuFLO and Tamaraw, we had to increase the length of the input to DF for this +dataset to 25000 to ensure that we capture most of the dataset. + +Figure~\ref{cat:fig:dynaflow} shows the results of our simulations for no defense as +well as the two configurations of DynaFlow. As for Wang \emph{et~al.}'s +dataset~\cite{Wang}, we see as expected that DF is highly effective and WO +access further improves the attack. 
Further, both configurations of DynaFlow are +effective defenses, comparable to CS-BuFLO with significantly lower overheads at +first glance. However, note that the comparison is problematic due to DynaFlow +combining smaller packets. The extra overhead for config 2 over 1 is not wasted: +recall is significantly reduced, more than halved for regular DF and slightly +less than half with a WO. + +\begin{figure}[!t] + \centering + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/dynaflow_nodef} + \caption{No defense.\\\,} + \label{cat:fig:dynaflow:nodef} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/dynaflow_config1} + \caption{DynaFlow~\cite{DynaFlow} config 1, with measured 59\% BOH and 24\% TOH.} + \label{cat:fig:dynaflow:config1} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/dynaflow_config2} + \caption{DynaFlow~\cite{DynaFlow} config 2, with measured 109\% BOH and 30\% TOH.} + \label{cat:fig:dynaflow:config2} + \end{subfigure} + \caption{Attack simulation for Deep Fingerprinting (DF)~\cite{DF} with website oracles (100 ms timeframe) on Lu \emph{et~al.}'s dataset~\cite{DynaFlow}. 
The lines in each sub-figure show DF with and without website oracle access for different starting Alexa ranks for monitored websites.} + \label{cat:fig:dynaflow} +\end{figure} diff --git a/summary/src/ctga/.gitignore b/summary/src/ctga/.gitignore new file mode 100644 index 0000000..8bb88c8 --- /dev/null +++ b/summary/src/ctga/.gitignore @@ -0,0 +1,9 @@ +main.pdf +*.blg +*.bbl +*.fls +*.fdb_latexmk +*.log +*.out +*.aux +*.swp diff --git a/summary/src/ctga/img/design.pdf b/summary/src/ctga/img/design.pdf new file mode 100644 index 0000000..3a9aba1 Binary files /dev/null and b/summary/src/ctga/img/design.pdf differ diff --git a/summary/src/ctga/img/parser.tex b/summary/src/ctga/img/parser.tex new file mode 100644 index 0000000..dba221b --- /dev/null +++ b/summary/src/ctga/img/parser.tex @@ -0,0 +1,66 @@ +\resizebox{\columnwidth}{!}{% +\begin{tikzpicture}[% + -latex, + sibling distance=10em, + level distance=22pt, + parser/.style = {% + draw, + shape=rectangle, + rounded corners, + align=center, + top color=white, + bottom color=mydblue!20, + }, + label/.style = {% + draw=none, + align=center, + text=mydblue, + font=\scriptsize, + }, + arrow/.style = {% + draw, + -latex, + rounded corners, + }, +] + + \node[parser](eth){Ethernet}; + \node[parser,right=of eth](udp){UDP}; + \coordinate(ip) at ($ (eth) !.5! 
(udp) $); + \node[parser,above=of ip](ipv4){IPv4}; + \node[parser,below=of ip](ipv6){IPv6}; + + \node[parser,right=of udp](dns){DNS}; + \node[parser](dnsp) at ($ (dns) + (1,1.1) $){preamble}; + \node[parser](dnsq) at ($ (dns) + (2.25,.45) $){domain name}; + \node[parser](dnst) at ($ (dns) + (2.25,-.45) $){query type}; + \node[parser](dnsc) at ($ (dns) + (1,-1.1) $){query class}; + + \path[arrow] ($ (eth) + (-1.25,0) $) -- node[left,pos=0]{\texttt{pkt\_in}} (eth); + + \path[arrow] (eth) |- node[label,above left,pos=1]{\texttt{type=0x0800}} (ipv4); + \path[arrow] (eth) |- node[label,below left,pos=1]{\texttt{type=0x86DD}} (ipv6); + \path[arrow] (ipv4) -| node[label,above right, pos=0]{\texttt{proto=0x11}} (udp); + \path[arrow] (ipv6) -| node[label,below right, pos=0]{\texttt{proto=0x11}} (udp); + \path[arrow] (udp) -- node[label,above]{\texttt{sport=53}} (dns); + + \path[arrow,dashed] (dns) |- (dnsp); + \path[arrow,dashed] (dnsp) -| node[label,above right,pos=0]{\texttt{qd=an=1}} (dnsq); + \path[arrow,dashed] (dnsq) -- node[label,right]{\texttt{known log}} (dnst); + \path[arrow,dashed] (dnst) |- node[label,below right,pos=1]{\texttt{TXT}} (dnsc); + \path[arrow,dashed] + (dnsq) edge[out=5, in=355, looseness=8] + node[label,right]{ + \begin{tabular}{c} + variable \\ + length + \end{tabular} + } + (dnsq); + \path[arrow,dashed] + (dnsc) -- + node[label,below right,pos=.8]{\texttt{IN}} + node[pos=1.2,rotate=90]{\texttt{clone}} + ($ (dnsc) + (-1.5,0) $); +\end{tikzpicture} +} diff --git a/summary/src/ctga/img/perf-netfpga.pdf b/summary/src/ctga/img/perf-netfpga.pdf new file mode 100644 index 0000000..309a689 Binary files /dev/null and b/summary/src/ctga/img/perf-netfpga.pdf differ diff --git a/summary/src/ctga/img/perf-xdp.pdf b/summary/src/ctga/img/perf-xdp.pdf new file mode 100644 index 0000000..041cbdd Binary files /dev/null and b/summary/src/ctga/img/perf-xdp.pdf differ diff --git a/summary/src/ctga/img/pl.pdf b/summary/src/ctga/img/pl.pdf new file mode 100644 index 
0000000..0b39f91 Binary files /dev/null and b/summary/src/ctga/img/pl.pdf differ diff --git a/summary/src/ctga/img/ps.pdf b/summary/src/ctga/img/ps.pdf new file mode 100644 index 0000000..cf3db16 Binary files /dev/null and b/summary/src/ctga/img/ps.pdf differ diff --git a/summary/src/ctga/img/related.tex b/summary/src/ctga/img/related.tex new file mode 100644 index 0000000..31a86e8 --- /dev/null +++ b/summary/src/ctga/img/related.tex @@ -0,0 +1,37 @@ +\resizebox{1\columnwidth}{!}{% + \begin{tikzpicture}[% + ns/.style = { + draw=none, + }, + ps/.style = { + draw, + -latex, + }, + ] + \node[ns](gossip){}; + \node[ns,right=0pt of gossip](retroactive){\textbf{Retroactive}}; + \node[ns,left=0pt of gossip](proactive){\textbf{Proactive}}; + + % proactive + \node[ns,left=12pt of proactive](cross){STH cross-logging~\cite{minimal-gossip,ietf-cross-logging,hof-cross-logging,catena}}; + \node[ns,above=0pt of cross](push){STH pushing~\cite{google-gossip}}; + \node[ns,below=0pt of cross](cosi){STH cosigning~\cite{cosi}}; + + \path[ps] (proactive) -- (push.east); + \path[ps] (proactive) -- (cross); + \path[ps] (proactive) -- (cosi.east); + + % retroactive + \node[ns,right=12pt of retroactive](implicit){Implicit via multipath~\cite{mpaudit}}; + \node[ns,above=0pt of implicit](pool){STH pooling~\cite{chuat-gossip,ietf-gossip}}; + \node[ns,below=0pt of implicit](trust){Trusted auditing~\cite{ietf-gossip}}; + \node[ns,above=14pt of retroactive.north east](feedback){SCT feedback~\cite{ietf-gossip}}; + \node[ns,below=14pt of retroactive.south east](bee){CT honey bee~\cite{ct-honey-bee}}; + + \path[ps] (retroactive) -- (feedback); + \path[ps] (retroactive) -- (pool.west); + \path[ps] (retroactive) -- (implicit); + \path[ps] (retroactive) -- (trust.west); + \path[ps] (retroactive) -- (bee); + \end{tikzpicture} +} diff --git a/summary/src/ctga/img/wcov-goo.pdf b/summary/src/ctga/img/wcov-goo.pdf new file mode 100644 index 0000000..976e5bd Binary files /dev/null and 
b/summary/src/ctga/img/wcov-goo.pdf differ diff --git a/summary/src/ctga/img/wcov-nor.pdf b/summary/src/ctga/img/wcov-nor.pdf new file mode 100644 index 0000000..110cf88 Binary files /dev/null and b/summary/src/ctga/img/wcov-nor.pdf differ diff --git a/summary/src/ctga/main.tex b/summary/src/ctga/main.tex new file mode 100644 index 0000000..bc5ff45 --- /dev/null +++ b/summary/src/ctga/main.tex @@ -0,0 +1,70 @@ +\begin{kaupaper}[ + author={% + \textbf{Rasmus Dahlberg}, + Tobias Pulls, + Jonathan Vestin, + Toke H{\o}iland-J{\o}rgensen, and + Andreas Kassler + }, + title={% + Aggregation-Based Certificate Transparency Gossip + }, + reference={% + SECURWARE (2019) + }, + summary={% + Another often overlooked part of Certificate Transparency is that monitors + and end-users who browse websites must observe the same append-only + logs. For example, if the same append-only logs are not observed, an + end-user may connect to a website that serves a mis-issued certificate + that no monitor will discover. This would largely defeat the purpose of + public logging, which is why RFC~6962 specifies that multiple gossip + protocols should be defined separately in the future. We define one such + protocol that plugs into the (at the time current) idea of having + end-users interact with the logs through DNS. Our work is exploratory, + using recent advancements in programmable packet processors that allow + turning routers, switches, and network interface cards into + \emph{aggregators} of tree heads that the logs signed and transmitted in + plaintext via DNS. The aggregated tree heads are then used as a reference + while challenging the logs to prove consistency, thus protecting + entire vantage points from undetected split views. A different + network path (like Tor) can be used to break out of a local vantage point + to increase the likelihood of global consistency. 
If the security + definition for \emph{aggregation indistinguishability} is satisfied, + vantage points without an aggregator may also receive protection due to + herd immunity. Our P4 and XDP prototypes satisfy the notion of + aggregation indistinguishability at line-rate with regard to throughput. + Prevalent vantage points to roll out aggregation-based gossip include + autonomous systems and Internet exchange points that route the traffic of + many users. Our RIPE Atlas measurements show that 32 autonomous systems + could protect 30-50\% of the IPv4 space from undetected split views. + End-users merely need to use plaintext DNS for opt-in. + }, + participation={\vspace{-.25cm} + Andreas and Tobias had the initial idea of exploring the intersection + between Certificate Transparency and programmable packet processors. I did most of the + design and writing with feedback from Tobias, our RIPE Atlas measurements, + and our performance benchmarks with Jonathan and Toke. + }, + label={ + paper:ctga + }, +] + \maketitle + \begin{abstract} + \input{src/ctga/src/abstract} + \end{abstract} + + \input{src/ctga/src/introduction} + \input{src/ctga/src/background} + \input{src/ctga/src/design} + \input{src/ctga/src/implementation} + \input{src/ctga/src/measurements} + \input{src/ctga/src/related} + \input{src/ctga/src/discussion} + \input{src/ctga/src/conclusion} + \input{src/ctga/src/acknowledgments} + + \bibliographystyle{plain} + \bibliography{src/ctga/src/ref} +\end{kaupaper} diff --git a/summary/src/ctga/src/abstract.tex b/summary/src/ctga/src/abstract.tex new file mode 100644 index 0000000..5483f7e --- /dev/null +++ b/summary/src/ctga/src/abstract.tex @@ -0,0 +1,16 @@ +\noindent +Certificate Transparency (CT) requires that every certificate which is issued by +a certificate authority must be publicly logged. While a CT log can be +untrusted in theory, it relies on the assumption that every client observes and +cryptographically verifies the same log. 
As such, some form of gossip mechanism +is needed in practice. Despite CT being adopted by several major browser +vendors, no gossip mechanism is widely deployed. +We suggest an aggregation-based gossip mechanism that passively observes +cryptographic material that CT logs emit in plaintext, aggregating at packet +processors (such as routers and switches) to periodically verify log consistency +off-path. In other words, gossip is provided as-a-service by the network. Our +proposal can be implemented for a variety of programmable packet processors at +line-speed without aggregation distinguishers (throughput), and based on +20 days of RIPE Atlas measurements that represent clients from 3500 autonomous +systems we show that significant protection against split-viewing CT logs can be +achieved with a realistic threat model and an incremental deployment scenario. diff --git a/summary/src/ctga/src/acknowledgments.tex b/summary/src/ctga/src/acknowledgments.tex new file mode 100644 index 0000000..a35331b --- /dev/null +++ b/summary/src/ctga/src/acknowledgments.tex @@ -0,0 +1,5 @@ +\section*{Acknowledgements} +We would like to thank Stefan Alfredsson and Philipp Winter for their RIPE Atlas +credits, as well as Jonas Karlsson and Ricardo Santos for helping with the +NetFPGA setup. We also received funding from the HITS research profile which is +funded by the Swedish Knowledge Foundation. diff --git a/summary/src/ctga/src/background.tex b/summary/src/ctga/src/background.tex new file mode 100644 index 0000000..e924d57 --- /dev/null +++ b/summary/src/ctga/src/background.tex @@ -0,0 +1,90 @@ +\section{Background} \label{ctga:sec:background} +First additional prerequisites are provided on CT and the status quo, +then the techniques which allow us to program custom packet processors are +introduced. 
+ +\subsection{Certificate Transparency} \label{ctga:sec:background:ct} +The main motivation of CT is that the CA ecosystem is error-prone~\cite{laurie}: + a CA can normally issue certificates for \emph{any} domain name, and + given that there are hundreds of trusted CAs an attacker only needs to + target the weakest link~\cite{ca-ecosystem}. +While the requirement of CT logging all certificates cannot prevent mis-issuance +proactively, it allows anyone to detect it retroactively by monitoring the +logs~\cite{ct}. After a log promises to include a certificate by issuing a +Signed Certificate Timestamp (SCT), a new STH including the appended certificate +must be issued within a Maximum Merge Delay (MMD). Typically, logs use 24~hour +MMDs. Should non-included SCTs and/or inconsistent STHs be found, +binding evidence of misbehaviour exists because these statements are +digitally signed by the logs. Other than MMD a log's policy defines parameters +such as STH frequency: + the number of STHs that can be issued during an MMD, making it harder to + track clients~\cite{ietf-gossip}. + +CT is being deployed across Apple's platform~\cite{apple-ct} and Google's +Chrome~\cite{google-ct}. The status quo is to trust a CA-signed certificate if +it is accompanied by two or more SCTs, thereby relying on at least one log to +append each certificate so that mis-issuance can be detected by monitors that +inspect the logs. The next step of this incremental deployment is to +\emph{verify} that these certificates are logged by querying for +inclusion~\cite{google-gossip}, and that the log's append-only property is +respected by challenging the log to prove STH consistency. Finally, to fully +distrust CT logs we need mechanisms that detect split-views. One such mechanism +which is based on programmable packet processors (introduced next) is presented +in Section~\ref{ctga:sec:design}, and it is compared to related work on CT gossip in +Section~\ref{ctga:sec:related}. 
+ +\subsection{Programmable Data Planes} \label{ctga:sec:background:pdp} +Packet processors such as switches, routers, and network interface cards +are typically integrated tightly using customized hardware and application-% +specific integrated circuits. This inflexible design limits the +potential for innovation and leads to long product upgrade cycles, where it +takes \emph{years} to introduce new processing capabilities and support for +different protocols and header fields (mostly following lengthy +standardization cycles). +The recent shift towards flexible \emph{match+action} packet-processing +pipelines---including + RMT~\cite{rmt}, + Intel Flexpipe~\cite{flexpipe}, + Cavium XPA~\cite{cavium}, and + Barefoot Tofino~\cite{barefoot}---% +now have the potential to change the way in which packet processing hardware is +implemented: + it enables programmability using high-level languages, such as P4, + while at the same time maintaining performance comparable to fixed-function + chips. + +\subsubsection{P4} +The main goal of P4 is to simplify + \barbelow{p}rogramming of + \barbelow{p}rotocol-independent + \barbelow{p}acket + \barbelow{p}rocessors +by providing an abstract programming model for the network data plane~\cite{p4}. +In this setting, the functionality of a packet processing device is specified +without assuming any hardwired protocols and headers. Consequently, a P4 program +must parse headers and connect the values of those protocol fields to the +actions that should be executed based on a pipeline of reconfigurable +match+action tables. +Based on the specified P4 code, a front-end compiler generates a high-level +intermediate representation that a back-end compiler uses to create a target-% +dependent program representation. 
Compilers are available for several platforms, +including + the software-based simple switch architecture~\cite{p4bm}, + SDNet for Xilinx NetFPGA boards~\cite{p4netfpga}, and + Netronome's smart network interfaces~\cite{p4netronome}. +It is also possible to compile basic P4 programs into eBPF byte +code~\cite{p42ebpf}. + +\subsubsection{XDP} +The Berkeley Packet Filter (BPF) is a Linux-based packet filtering +mechanism~\cite{bpf}. Verified bytecode is injected from user space, and +executed for each received packet in kernel space by a just-in-time compiler. +Extended BPF (eBPF) +enhances the original BPF concept, enabling faster runtime and many new +features. For example, an eBPF program can be attached to the Linux traffic +control tool \texttt{tc}, and additional hooks were defined for a faster eXpress +Data Path (XDP)~\cite{xdp}. In contrast to the Intel Data Plane Development Kit +(DPDK), which runs in user space and completely controls a given network +interface that supports a DPDK driver, XDP cooperates with the Linux stack to +achieve fast, programmable, and reconfigurable packet processing using C-like +programs. diff --git a/summary/src/ctga/src/conclusion.tex b/summary/src/ctga/src/conclusion.tex new file mode 100644 index 0000000..f001ace --- /dev/null +++ b/summary/src/ctga/src/conclusion.tex @@ -0,0 +1,23 @@ +\section{Conclusion and Future Work} \label{ctga:sec:conclusion} +Wide spread modifications of TLS clients are inevitable to support CT gossip. +We propose that these modifications include challenging the logs to +prove certificate inclusion based on STHs \emph{fetched in plaintext}, thereby +enabling the traversed packet processors to assist in split view detection +retroactively by aggregating STHs for periodic off-path verification. 
Our +results show that the aggregation-step can be implemented without throughput-% +based distinguishers for a distant attacker, and that our approach offers rapid +incremental deployment with high impact on a significant fraction of Internet +users. Beyond being an application neutral approach that is complementary to +proactive gossip, a compelling aspect is that core packet processors are used +(rather than clients) as a key building block: + should a consistency issue arise, it is already in the hands of an actor + that is better equipped to investigate the cause manually. +Further, considering that far from all TLS clients are backed by big browser +vendors (not to mention other use-cases of transparency logs in general) it is +likely a long-term win to avoid pushing complex retroactive gossip logic into +all the different types of clients when there are orders of magnitudes fewer +packet processors that could aggregate to their own off-path challengers. +Future work includes different instantiations of the aggregation step and +evaluating whether aggregation indistinguishability is provided based on +throughput and/or latency. The setting may also change in some scenarios, +e.g., if DNS caches are aggregated the transport need not be plaintext. diff --git a/summary/src/ctga/src/design.tex b/summary/src/ctga/src/design.tex new file mode 100644 index 0000000..a04e36b --- /dev/null +++ b/summary/src/ctga/src/design.tex @@ -0,0 +1,129 @@ +\section{Design} \label{ctga:sec:design} +An overview of aggregation-based gossip is shown in Figure~\ref{ctga:fig:agg}. The +setting consists of logs that send plaintext STHs to clients over a network, and +as part of the network inline \emph{packet processors} passively aggregate +observed STHs to their own off-path \emph{challengers} which challenge the logs +to prove consistency. 
A log cannot present split views to different clients that +share an aggregating vantage point because it would trivially be detected by +that vantage point's challenger. A log also cannot present a persistent split +view to different challengers because they are off-path in the sense that they +are indistinguishable from one another. This means that every client that is +covered by an aggregator must be on the same view because at least one +challenger will otherwise detect an inconsistency and report it. A client that +is not directly covered by an aggregator may receive indirect protection in the +form of herd immunity. This is discussed in Section~\ref{ctga:sec:discussion:herd}. +\begin{figure}[!t] + \centering + \includegraphics[width=\columnwidth]{src/ctga/img/design.pdf} + \caption[ ]{% + Packet processor that aggregates plaintext STHs for off-path verification. + } + \label{ctga:fig:agg} +\end{figure} + +\subsection{Threat Model and Security Notion} \label{ctga:sec:agg:thr} +The overarching threat is undetectable domain impersonation (ex-post) by an +attacker that is capable of compromising at least one CA and a sufficient number +of CT logs to convince a client into accepting a forged certificate. +We assume that any illegitimately issued certificate +would be detected by the legitimate domain owner through self or delegated +third-party monitoring. +This means that an attacker must either provide a split view towards the victim +or the monitoring entity. +We also assume that clients query the logs for certificate inclusion based on +STHs that they acquire from the logs via plaintext mechanisms that packet +processors can observe, and that some other entities than challengers process +STHs using the chosen off-paths (Section~\ref{ctga:sec:discussion:limitations}). +We do not account for the fact that CA compromises may be detected by other +means, focusing solely on split-viewing CT logs. 
+ +\subsubsection{Limitations} +Our gossip mechanism is limited to STHs that packet processors can observe. +As such, a client isolated by an attacker is not protected. We limit ourselves +to attackers that act over a network some distance +(in the sense of network path length) from a client in plaintext so that +aggregation can take place. Our limitations and assumptions are further +discussed in Section~\ref{ctga:sec:discussion:limitations}. + +\subsubsection{Attackers} +Exceptionally powerful attackers can isolate clients, \emph{but clients are not +necessarily easy to isolate} for a significant number of relevant attackers. +Isolation may require physical control over a device~\cite{fbi-apple}, +clients may be using anonymity networks like Tor where path selection is +inherently unpredictable~\cite{tor}, or sufficiently large parts of the network +cannot be controlled to ensure that no aggregation takes place. +This may be the case if we consider + a nation state actor attacking another nation state actor, + the prevalence of edge security middleboxes, and + that home routers or NICs nearby the clients could aggregate. +Any attacker that cannot account for these considerations is within our +threat model. + +\subsubsection{Security Notion} +To bypass our approach towards gossip an adaptive attacker may attempt to +actively probe the network for aggregating packet processors. This leads us to +the key security notion: + \emph{aggregation indistinguishability}. +An attacker should not be able to determine if a packet processor is aggregating +STHs. The importance of aggregation indistinguishability motivates the design of +our gossip mechanism into two distinct components: + aggregation that takes place inline at packet processors, and + periodic off\mbox{-}path log challenging that checks whether the observed STHs + are consistent. 
+ +\subsection{Packet Processor Aggregation} \label{ctga:sec:aggregator} +An aggregating packet-processor determines for each packet if it is STH-related. +If so, the packet is cloned and sent to a challenging component for off-path +verification. +The exact definition of \emph{STH-related} depends on the plaintext source, but it +is ultimately the process of inspecting multiple packet headers such as +transport protocol and port number. It should be noted that the original packet +must not be dropped or modified. For example, an aggregator would have a +trivial aggregation distinguisher if it dropped any malformed STH. + +For each aggregating packet processor we have to take IP fragmentation into +consideration. Recall that IP fragmentation usually occurs when a packet is +larger than the MTU, splitting it into multiple smaller IP packets that are +reassembled at the destination host. Normally, an STH should not be fragmented +because it is much smaller than the de-facto minimum MTU of (at least) 576~% +bytes~\cite{min-mtu,ipv6}, but an attacker could use fragmentation to +\emph{intentionally} spread expected headers across multiple packets. +Assuming stateless packet processing, an aggregator cannot identify +such fragmented packets as STH-related because some header would be absent + (cf.\ stateless firewalls). +All tiny fragments should therefore be aggregated to account for intentional IP +fragmentation, which appears to have little or no impact on normal traffic +because tiny fragments are anomalies~\cite{frag-study-02}. The threat of +multi-path fragmentation is discussed in Section~\ref{ctga:sec:discussion:limitations}. + +Large traffic loads must also be taken into account. If an aggregating +packet processor degrades in performance as the portion of STH-related traffic +increases, a distant attacker may probe for such behaviour to determine if a +path contains an aggregator. 
Each \emph{implementation} must therefore be +evaluated individually for such behaviour, and if trivial aggregation +distinguishers exist this needs to be solved. For example, STH-related traffic +could be aggregated probabilistically to reduce the amount of work. +Another option is to load-balance the traffic before aggregation, i.e., avoid +worst-case loads that cannot be handled. + +\subsection{Off-Path Log Challenging} \label{ctga:sec:challenger} +A challenger is setup to listen for aggregated traffic, reassembling IP +fragments and storing the aggregated STHs for periodic off-path verification. +Periodic off\mbox{-}path verification means that the challenger challenges the log +based on its own (off-path fetched) STHs and the observed (aggregated) STHs to +verify log consistency periodically, e.g., every day. +The definition of \emph{off-path} is that the challenger must not be linkable to +its aggregating packet processor(s) or any other challenger (including itself). +Without an off-path there is no gossip step amongst aggregator-challenger +instances that are operated by different actors, and our approach towards gossip +would only assert that clients behind the same vantage point observe the same +logs. If a log cannot distinguish between different challengers due to the +use of off-paths, however, it is non-trivial to maintain a targeted split-view +towards an unknown location. Therefore, we get a form of \emph{implicit +gossip}~\cite{mpaudit} because at least one challenger would detect an +inconsistency unless everybody observes the same log. If every challenger +observes the same log, so does every client that is covered by an aggregating +packet processor. Notably the challenger component \emph{does not run inline} +to avoid timing distinguishers. Note that there are other important +considerations when implementing a challenger, as discussed in +Section~\ref{ctga:sec:discussion:limitations}. 
diff --git a/summary/src/ctga/src/discussion.tex b/summary/src/ctga/src/discussion.tex new file mode 100644 index 0000000..3a542d1 --- /dev/null +++ b/summary/src/ctga/src/discussion.tex @@ -0,0 +1,126 @@ +\section{Discussion} \label{ctga:sec:discussion} +Next we discuss assumptions, limitations and deployment, showing that +our approach towards retroactive gossip can be deployed to detect +split-views by many relevant attackers with relatively little effort. The +main drawback is reliance on clients fetching STHs in plaintext, e.g., using +CT-over-DNS~\cite{ct-over-dns}. + +\subsection{Assumptions and Limitations} \label{ctga:sec:discussion:limitations} +Aggregation-based gossip is limited to network traffic that packet processors +can observe. The strongest type of attacker in this setting---who can completely +isolate a client---trivially defeats our gossip mechanism and other retroactive +approaches in the literature (see Section~\ref{ctga:sec:related}). +A weaker attacker cannot isolate a client, but is located nearby in a network +path length sense. This limits the opportunity for packet processor aggregation, +but an attacker cannot rule it out given aggregation indistinguishability. +Section~\ref{ctga:sec:implementation} showed based on performance that it is non-% +trivial to distinguish between (non\mbox{-})aggregating packet processors on two +different targets using P4 and XDP. Off-path challengers must also be +indistinguishable from one another to achieve \emph{implicit gossip}. +While we suggested the use of anonymity networks like Tor, a prerequisite is +that this is in and of itself not an aggregation distinguisher. +Therefore, we assume that other entities also use off-paths to fetch and verify +STHs. The fact that a unique STH \emph{is not audited} from an off-path could +also be an aggregation distinguisher. 
To avoid this we could rely on a +verifiable STH history~\cite{ver-sth} +and wait until the next MMD to audit or simply monitor the full log so that +consistency proofs are unnecessary. + +The existence of multiple network paths are fundamental to the structure and +functioning of the Internet. A weak attacker may use IP fragmentation such that +each individual STH fragment is injected from a different location to make +aggregation harder, approaching the capabilities of a stronger attacker that +is located closer to the client. This is further exacerbated by the deployment +of multi-path transport protocols like MPTCP (which can also be fragmented). +Looking back at our RIPE Atlas measurements in Section~\ref{ctga:sec:measurements}, the +results towards Google's world-wide infrastructure better represent an active +attacker that takes \emph{some} measures to circumvent aggregation by +approaching a client nearby the edge. Given that the likelihood of aggregation +is high if \emph{any} IXP is present (Figure~\ref{ctga:fig:wcov}), aggregation at +well-connected IXPs are most likely to be circumvented. + +\subsection{Deployment} \label{ctga:sec:disussion:deployment} +Besides aggregating at strategic locations in the Internet's backbone, +ISPs and enterprise networks have the opportunity to protect all of their +clients with relatively little effort. Deployment of special-purpose middleboxes +are already prevalent in these environments, and then the inconvenience of +fragmentation tends to go away due to features such as packet reassembly. +Further, an attacker cannot trivially circumvent the edge of a network topology% +---especially not if aggregation takes place on an end-system: + all fragments are needed to reassemble a packet, which means that multi-path + fragmentation is no longer a threat. +If aggregation-based gossip is deployed on an end-system, STHs could be +hooked using other approaches than P4/XDP. 
For example, shim-layers that +intercept TLS certificates higher up in the networking stack were already +proposed by Bates~\emph{et~al.}~\cite{h1} and O'Neill~\emph{et~al.}~\cite{h2}. +In this setting, an end-system is viewed as the aggregating packet processor, +and it reports back to an off-path challenger that may be a local process +running on the same system or a remote entity, e.g., a TelCo could host +challengers that collect aggregated STHs from smartphones. + +While we looked at programming physical packet processors like routers, +STH aggregation could be approached in hypervisors and software +switches~\cite{pisces} to protect many virtual hosts. If CT-over-DNS is used to +fetch STHs, it would be promising to output DNS server caches to implement the +aggregation step. Similar to DNS servers, so called Tor exist relays also +operate DNS caches. In other words, P4 and XDP are only examples of how to +\emph{instantiate} the aggregation step. Depending on the used plaintext source, +packet processor, and network topology other approaches may be more suitable, +e.g., C for vendor-specific middleboxes. + +\subsection{Retroactive Gossip Benefits From Plaintext} +As opposed to an Internet core that only forwards IP packets, extra +functionality is often embedded which causes complex processing dependencies and +protocol ossification~\cite{TCPoss}. Many security and protocol issues were +found for middleboxes that provides extra +functionality~\cite{HTTPSintercept,langely-quic}, resulting in the mindset +that \emph{everything} should be encrypted~\cite{langely-quic}. +Our work is controversial because it goes against this mindset and advocates +that STHs should be communicated in plaintext. +We argue that this makes sense in the context of STHs due to the absence of +privacy concerns and because the entire point of gossip is to make STHs +\emph{available} (rather than end-to-end). 
+The idea of intentionally exposing information to the network is not new, e.g., +MPQUIC is designed to support traffic shaping~\cite{mpquic}. + +While we used CT-over-DNS as a plaintext source, there is a push towards +DNS-over-TLS~\cite{dot} and DNS-over-HTTPS~\cite{doh}. +Wide use of these approaches could undermine our gossip mechanism, but +ironically the security of TLS could be jeopardized unless gossip is deployed. +In other words, long term gossip is an essential component of CT and other +transparency logs to avoid becoming yet another class of trusted third-parties. +If proactive approaches such as witness cosigning are rejected in favour of +retroactive mechanisms, then ensuring that STHs are widely spread and easily +accessible is vital. An STH needs no secrecy if the appropriate measures are +taken to make it privacy-insensitive~\cite{ietf-gossip}. +While secure channels also provide integrity and replay protection, an STH is +already signed by logs and freshness is covered by MMDs, as well as issue +frequency to protect privacy. +A valid argument against exposing any plaintext to the network is protocol +ossification. We emphasize that our design motivates why packet processors +should fail open: + otherwise there is no aggregation indistinguishability. +Note that there are other plaintext sources than CT-over-DNS that could be +aggregated. However, if these sources require stream-reassembly it is +generally hard to process in languages such as P4 and XDP~\cite{ctga-thesis}. + +\subsection{Indistinguishability and Herd Immunity} \label{ctga:sec:discussion:herd} +An attacker that gains control over a CT log is bound to be more risk averse +than an attacker that compromises a CA. There is an order of magnitude fewer +logs than CAs, and client vendors are likely going to be exceptionally picky +when it comes to accepted and rejected logs. 
+We have already seen examples of this, including Google Chrome disqualifying +logs that made mistakes: + Izenpe used the same key for production and testing~\cite{izenpe}, and + Venafi suffered from an unfortunate power outage~\cite{venafi}. +Risk averse attackers combined with packet processors that are aggregation +indistinguishable may lead to \emph{herd immunity}: despite a significant +fraction of clients that lack aggregators, indirect protection may be provided +because the risk of eventual detection is unacceptable to many attackers. Hof +and Carle~\cite{hof-cross-logging} and Nordberg \emph{et~al.}~\cite{ietf-gossip} +discussed herd immunity~briefly~before~us. +While herd immunity is promising, it should be noted that aggregation +distinguishable packet processors at \emph{the edge of a network topology} may +be acceptable for some. In other words, if an aggregator cannot be circumvented +but it is detectable split-views would still be deterred against covered +clients if the challenger is off-path. diff --git a/summary/src/ctga/src/implementation.tex b/summary/src/ctga/src/implementation.tex new file mode 100644 index 0000000..9a35cfe --- /dev/null +++ b/summary/src/ctga/src/implementation.tex @@ -0,0 +1,82 @@ +\section{Distinguishability Experiments} \label{ctga:sec:implementation} +There are many different ways to implement the aggregation step. We decided to +use P4 and XDP because a large variety of programmable +packet processors support these languages (Section~\ref{ctga:sec:background:pdp}). +The aggregated plaintext source is assumed to be CT-over-DNS~\cite{ct-over-dns}, +which means that a client obtains STHs by fetching IN TXT resource records. +Since languages for programmable packet processors are somewhat restricted, +we facilitated packet processing by requiring that at most one STH is sent per +UDP packet. +This is reasonable because logs should only have one \emph{most recent} STH. 
+A DNS STH is roughly 170~bytes without any packet headers and +should normally not be fragmented, but to ensure that we do not miss any +intentionally fragmented STHs we aggregate every tiny fragment. We did not +implement the challenging component because it is relatively easy given +an existing off-path. Should any scalability issue arise for the challenger +there is nothing that prevents a distributed front-end that processes the +aggregated material before storage. Storage is not an issue because there are +only a limited amount of unique STHs per day and log + (one new STH per hour is a common policy, and browsers recognize $\approx 40$ + logs). +Further implementation details can be found +online~\cite{github,full-version}. + +\subsection{Setup} +We used a test-bed consisting of + a traffic generator, + a traffic receiver, and + an aggregating target in between. +The first target is a P4-enabled NetFPGA SUME board that runs an adapted version +of our P4 reference implementation. +The second target is a net-next kernel v4.17.0-rc6 Linux machine that runs XDP +on one core with + a 10~Gb SFP+ X520 82599ES Intel card, + a $3.6$~GHz Intel Core i7-4790 CPU, and + 16~GB of RAM at 1600~MHz (Hynix/Hyundai). +We would like to determine whether there are any aggregation distinguishers as +the fraction of STHs (experiment 1) and tiny fragments (experiment 2) in the +traffic is increased from 0--100\%, i.e., does performance degrade as a +function of STH-related rate? Non-fragmented STH packets are +411~bytes + (we used excessively large DNS headers to maximize the packet parsing + overhead), +and tiny fragments are 64~bytes. All background traffic have the same packet +sizes but is not deemed STH-related. + +\subsection{Results} +Figure~\ref{ctga:fig:perf-p4} shows throughput as a function of STH-related rate for +the P4-enabled NetFPGA. 
While we were unable to observe any distinguisher between +normal routing and the edge case of 100\% aggregation for +non-fragmented STH packets, there is a small constant throughput difference for +tiny fragments ($7.5$~Kbps). This is a non-negligible \emph{program +distinguisher} if a packet processor is physically isolated as in our benchmark, +i.e., something other than a routing program is running but it is not +necessarily an aggregator because performance does not degrade as a function +of increased STH-related rate. However, we found such degradation behaviour for the +single-core XDP case (Figure~\ref{ctga:fig:perf-xdp}). If line-speed is higher than +2~Gbps, STHs could be aggregated probabilistically or traffic could be load-% +balanced to \emph{overcome} this issue. +\begin{figure}[!t] + \centering + \begin{subfigure}[b]{.8\textwidth} + \includegraphics[width=\textwidth]{src/ctga/img/perf-netfpga} + \caption{P4 NetFPGA} + \label{ctga:fig:perf-p4} + \end{subfigure} + + \begin{subfigure}[b]{.8\textwidth} + \includegraphics[width=\textwidth]{src/ctga/img/perf-xdp} + \caption{XDP on a single core} + \label{ctga:fig:perf-xdp} + \end{subfigure} + \caption{% + Throughput as a function of STH-related traffic that is aggregated. + } + \label{ctga:fig:perf} +\end{figure} + +\subsection{Lessons Learned} +P4-NetFPGA provides aggregation indistinguishability regardless of STH load. +For XDP, it depends on the scenario: what is the line-rate criteria and how many +cores are available. For example, five cores support 10~Gbps aggregation +indistinguishability without probabilistic filtering or load balancing. diff --git a/summary/src/ctga/src/introduction.tex b/summary/src/ctga/src/introduction.tex new file mode 100644 index 0000000..248785e --- /dev/null +++ b/summary/src/ctga/src/introduction.tex @@ -0,0 +1,93 @@ +\section{Introduction} \label{ctga:sec:introduction} +The HyperText Transfer Protocol Secure (HTTPS) ecosystem is going through a paradigm shift. 
As opposed to blindly +trusting that Certificate Authorities (CAs) only issue certificates to the +rightful domain owners% + ---a model known for its weakest-link security~\cite{ca-ecosystem}---% +transparency into the set of issued certificates is incrementally being +required by major browser vendors~\cite{apple-ct,google-ct}. This transparency +is forced and takes the form of Certificate Transparency (CT) logs: + the idea is to reject any TLS certificate that have yet to be publicly logged, + such that domain owners can monitor the logs for client\mbox{-}accepted certificates + to \emph{detect} certificate mis-issuance \emph{after the fact}~\cite{ct}. +While the requirement of certificate logging is a significant improvement to the +HTTPS ecosystem, the underlying problem of trusting CAs cannot be solved by the +status quo of trusted CT logs (described further in +Section~\ref{ctga:sec:background:ct}). Therefore, it is paramount that nobody +needs to trust these logs once incremental deployments are matured. + +CT is formalized and cryptographically verifiable~\cite{ct-formal}, supporting +inclusion and consistency proofs. +This means that a client can verify whether a log is +operated correctly: + said certificates are included in the log, and + nothing is being removed or modified. +Despite the ability to cryptographically verify these two properties, there are +no assurances that everybody observes \emph{the same +log}~\cite{chuat-gossip,ct}. For example, certificate mis-issuance would +not be detected by a domain owner that monitors the logs if fraudulently issued +certificates are shown to the clients selectively. A log that serves different +versions of itself is said to present a \emph{split view}~\cite{ietf-gossip}. +Unless such log misbehaviour can be detected, we must trust it not to happen. + +The solution to the split viewing problem is a gossip mechanism which ensures +that everybody observes \emph{the same} consistent log~\cite{ct}. 
This +assumption is simple in theory but remarkably hard in practice due to + client privacy, + varying threat models, and + deployment challenges~\cite{ietf-gossip,cosi}. +While Google started on a package that supports + minimal gossip~\cite{minimal-gossip} and + the mechanisms of Nordberg \emph{et~al.}~\cite{ietf-gossip}, +there is ``next to no deployment in the wild''~\cite{little-or-no-gossip}. +To this end, we propose a gossip mechanism that helps detecting split-view +attacks retroactively based on the idea of packet processors, such as routers +and middleboxes, that \emph{aggregate} Signed Tree Heads (STHs)---succinct +representations of the logs' states---that are exposed to the network \emph{in +plaintext}. +The aggregated STHs are then used to challenge the logs to prove consistency +via an off-path, such that the logs cannot distinguish between challenges that +come from different aggregators. Given this indistinguishability assumption, it +is non-trivial to serve a consistent split-view to an unknown +location~\cite{mpaudit}. Thus, all aggregators must be on the same view, and +accordingly all clients that are covered by these aggregators must also be on +the same view \emph{despite not doing any explicit gossip themselves} because +gossip is provided as-a-service by the network. An isolated client (i.e., +untrusted network path to the aggregator) is notably beyond reach of any +retroactive gossip~\cite{cosi}. + +The premise of having STHs in plaintext is controversial given current trends to +encrypt transport protocols, which is otherwise an approach that combats +inspection of network traffic and protocol +ossification~\cite{HTTPSintercept,TCPoss}. 
We argue that keeping gossip +related material in plaintext to support aggregation-based gossip comes with few +downsides though: + it is easy to implement, + there are no major negative privacy impacts, and + it would offer significant protection for a large portion of the Internet + with a realistic threat model \emph{despite relatively small deployment + efforts}. +The three main limitations are + no protection against isolated clients, + reliance on clients that fetch STHs from the logs in plaintext, and + possible concerns surrounding protocol ossification~\cite{TCPoss}. +Our contributions are: +\begin{itemize} + \item Design and security considerations for a network-based gossip mechanism + that passively aggregates STHs to verify log consistency off-path + (Section~\ref{ctga:sec:design}). + \item Generic implementations of the aggregation step using P4~\cite{p4} and + XDP~\cite{xdp} for plaintext STHs, supporting line-speed packet + processing on systems that range from switches, routers, network interface + cards, and Linux (Section~\ref{ctga:sec:implementation}). + \item A simulation based on RIPE Atlas measurements that evaluate the impact + of deploying aggregation-based gossip at ASes and IXPs. Our evaluation shows + that incremental roll-out at well-connected locations would protect a + significant portion of all Internet clients from undetected split views + (Section~\ref{ctga:sec:measurements}). +\end{itemize} + +Besides the sections referenced above, the paper introduces necessary +background in Section~\ref{ctga:sec:background} and provides discussion, conclusion, +and future work in Sections~\ref{ctga:sec:related}--\ref{ctga:sec:conclusion}. +A full version with additional implementation details is available +online~\cite{full-version}. 
diff --git a/summary/src/ctga/src/measurements.tex b/summary/src/ctga/src/measurements.tex new file mode 100644 index 0000000..ee0ea89 --- /dev/null +++ b/summary/src/ctga/src/measurements.tex @@ -0,0 +1,85 @@ +\section{Estimated Impact of Deployment} \label{ctga:sec:measurements} +We conducted 20 daily traceroute measurements during spring 2018 on the RIPE +Atlas platform to evaluate the effectiveness of aggregation-based gossip. The +basic idea is to look at client coverage as central ASes and IXPs aggregate +STHs. If any significant client coverage can be achieved, the likelihood of +pulling off an undetected split-view will be small. + +\subsection{Setup} +We scheduled RIPE Atlas measurements from roughly 3500 unique ASes that +represent 40\% of the IPv4 space, trace-routing Google's authoritative +CT-over-DNS server and NORDUnet's CT log to simulate clients that fetch DNS STHs +in plaintext. Each traceroute result is a list of +traversed IPs, and it can be translated into the corresponding ASes and IXPs +using public data sets~\cite{pub-routeviews,pub-caida}. +In other words, traversed ASes and IXPs can be determined for each probe. Since +we are interested in client coverage as ASes and IXPs aggregate, each +probe is weighted by the IPv4 space of its AS. While an IP address is an +imperfect representation of a client, e.g., an IP may be unused or reused, it +gives a decent idea of how significant it is to cover a given probe. + +\subsection{Results} +Figure~\ref{ctga:fig:pl} shows AS/IXP path length and stability from the probes to +the targets. +If the AS path length is one, a single AS is traversed \emph{before reaching the +target}. It is evident that an AS path tends to be one hop longer +towards NORDUnet than Google because there is a rough off-by-one offset on the +x-axis. +A similar trend of greater path length towards NORDUnet can be observed for +IXPs. 
For example, + 74.0\% of all paths traversed no IXP towards Google, but + 58.5\% of all paths traversed a single IXP towards NORDUnet. +These results can be explained by infrastructural differences of our targets: + since Google is a worldwide actor an average path should be shorter + compared to a region-restricted actor like NORDUnet. +We also observed that AS and IXP paths tend to be quite stable over 20~days + (the duration of our measurements). +I.e., if AS $a$ and $b$ are traversed it is unlikely to suddenly be +routed via AS~$c$. +\begin{figure}[!t] + \centering + \includegraphics[width=0.5\columnwidth]{src/ctga/img/pl}% + \includegraphics[width=0.5\columnwidth]{src/ctga/img/ps} + \caption{% + Path length and stability towards Google and NORDUnet. + } + \label{ctga:fig:pl} +\end{figure} + +Figure~\ref{ctga:fig:wcov} shows coverage of the RIPE Atlas network as $1\ldots n$ actors +aggregate STHs. For example, 100\% and 50\% coverage means that at least 40\% +and 20\% of the full IPv4 space is covered. The aggregating ASes and IXPs were +selected based on the most commonly traversed vantage points in +our measurements (Pop), as well as CAIDA's largest AS ranking~\cite{caida}. +We found that more coverage is achieved when targeting +NORDUnet than Google. This is expected given that the paths tend to be longer. +If CAIDA's top-32 enabled aggregation, the coverage would be significant +towards Google (31.6\%)~and~NORDUnet~(58.1\%). + +\begin{figure}[!t] + \centering + \includegraphics[width=0.5\columnwidth]{src/ctga/img/wcov-goo}% + \includegraphics[width=0.5\columnwidth]{src/ctga/img/wcov-nor} + \caption{% + Coverage as a function of aggregation opt-in. + } + \label{ctga:fig:wcov} +\end{figure} + +\subsection{Lessons Learned} +A vast majority of all clients traverse \emph{at least} one AS that could +aggregate. It is relatively rare to traverse IXPs towards Google but not +NORDUnet. 
We also learned that paths tend to be stable, which means that the +time until split view detection would be at least 20 days \emph{if} it is +possible to find an unprotected client. This increases the importance of +aggregation indistinguishability. +Finally, we identified vantage points that are commonly traversed using Pop, and +these vantage points are represented well by CAIDA's independent AS ranking. +Little opt-in from ASes and IXPs provides significant coverage against an +attacker that is relatively close to a client + (cf.\ world-wide infrastructure of Google). +Although we got better coverage for NORDUnet, any weak attacker would approach +Google's coverage by renting infrastructure nearby. +Any weak attacker could also circumvent IXP aggregation by detecting the IXP +itself~\cite{ixp-detect}. As such, top-ranked AS aggregation should give +the best split-view protection. diff --git a/summary/src/ctga/src/ref.bib b/summary/src/ctga/src/ref.bib new file mode 100644 index 0000000..5aa2314 --- /dev/null +++ b/summary/src/ctga/src/ref.bib @@ -0,0 +1,573 @@ +@inproceedings{ixp-detect, + author = {George Nomikos and Xenofontas A. 
Dimitropoulos}, + title = {{traIXroute}: Detecting {IXPs} in traceroute paths}, + booktitle = {PAM}, + year = {2016}, +} + +@mastersthesis{ctga-thesis, + author = {Rasmus Dahlberg}, + title = {Aggregating {Certificate Transparency} Gossip Using Programmable Packet Processors}, + school = {Karlstad University}, + year = {2018}, + type = {Master Thesis}, +} + +@article{laurie, + author = {Ben Laurie}, + title = {{Certificate Transparency}}, + journal = {{ACM} Queue}, + volume = {12}, + number = {8}, + year = {2014}, +} + +@misc{minimal-gossip, + author = {David Drysdale}, + title = {Minimal Gossip}, + howpublished = {\url{https://github.com/google/certificate-transparency-go/blob/master/gossip/minimal}, accessed 2019-09-04}, +} + +@misc{google-ct, + author = {Devon O'Brien}, + title = {{Certificate Transparency} Enforcement in {Google Chrome}}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!msg/ct-policy/wHILiYf31DE/iMFmpMEkAQAJ}, accessed 2019-09-04}, +} + +@misc{apple-ct, + author = {Apple Inc.}, + title = {Apple's {Certificate Transparency} Policy}, + howpublished = {\url{https://support.apple.com/en-us/HT205280}, accessed 2019-09-04}, +} + +@inproceedings{h2, + author = {Mark O'Neill and + Scott Heidbrink and + Scott Ruoti and + Jordan Whitehead and + Dan Bunker and + Luke Dickinson and + Travis Hendershot and + Joshua Reynolds and + Kent E. Seamons and + Daniel Zappala}, + title = {{TrustBase}: An Architecture to Repair and Strengthen Certificate-Based Authentication}, + booktitle = {USENIX Security}, + year = {2017}, +} + +@inproceedings{h1, + author = {Adam Bates and + Joe Pletcher and + Tyler Nichols and + Braden Hollembaek and + Dave Tian and + Kevin R. B. 
Butler and + Abdulrahman Alkhelaifi}, + title = {Securing {SSL} Certificate Verification through Dynamic Linking}, + booktitle = {CCS}, + year = {2014}, +} + +@inproceedings{catena, + author = {Alin Tomescu and Srinivas Devadas}, + title = {Catena: Efficient Non-equivocation via {Bitcoin}}, + booktitle = {IEEE S\&P}, + year = {2017}, +} + +@inproceedings{ct-pir, + author = {Wouter Lueks and Ian Goldberg}, + title = {Sublinear Scaling for Multi-Client Private Information Retrieval}, + booktitle = {FC}, + year = {2015}, +} + +@techreport{ct, + author = {Ben Laurie and Adam Langley and Emilia Kasper}, + title = {{Certificate Transparency}}, + type = {RFC}, + institution = {IETF}, + number = {6962}, + year = {2013}, +} + +@misc{vds, + author = {Adam Eijdenberg and Ben Laurie and Al Cutter}, + title = {Verifiable Data Structures}, + howpublished = {\url{https://github.com/google/trillian/blob/master/docs/VerifiableDataStructures.pdf}, accessed 2019-09-04}, +} + +@inproceedings{transparency-overlays, + author = {Melissa Chase and Sarah Meiklejohn}, + title = {Transparency Overlays and Applications}, + booktitle = {CCS}, + year = {2016}, +} + +@inproceedings{coniks, + author = {Marcela S. Melara and + Aaron Blankstein and + Joseph Bonneau and + Edward W. Felten and + Michael J. Freedman}, + title = {{CONIKS}: Bringing Key Transparency to End Users}, + booktitle = {USENIX Security}, + year = {2015}, +} + +@inproceedings{ca-ecosystem, + author = {Zakir Durumeric and + James Kasten and + Michael Bailey and + J. 
Alex Halderman}, + title = {Analysis of the {HTTPS} Certificate Ecosystem}, + booktitle = {IMC}, + year = {2013}, +} + +@inproceedings{little-or-no-gossip, + author = {Oliver Gasser and + Benjamin Hof and + Max Helm and + Maciej Korczynski and + Ralph Holz and + Georg Carle}, + title = {In Log We Trust: Revealing Poor Security Practices with {Certificate Transparency} Logs and Internet Measurements}, + booktitle = {PAM}, + year = {2018}, +} + +@techreport{ietf-gossip, + author = {Linus Nordberg and Daniel Kahn Gillmor and Tom Ritter}, + title = {Gossiping in {CT}}, + number = {draft-ietf-trans-gossip-05}, + type = {Internet-draft}, + institution = {IETF}, + year = {2018}, +} + +@inproceedings{chuat-gossip, + author = {Laurent Chuat and + Pawel Szalachowski and + Adrian Perrig and + Ben Laurie and + Eran Messeri}, + title = {Efficient Gossip Protocols for Verifying the Consistency of + Certificate Logs}, + booktitle = {CNS}, + year = {2015}, +} + +@misc{ct-honey-bee, + author = {Andrew Ayer}, + title = {Lightweight Program that Pollinates {STHs} Between {Certificate + Transparency} Logs and Auditors}, + howpublished = {\url{https://github.com/SSLMate/ct-honeybee}, accessed 2019-09-04}, +} + +@article{hof-cross-logging, + author = {Benjamin Hof and Georg Carle}, + title = {Software Distribution Transparency and Auditability}, + journal = {CoRR}, + volume = {abs/1711.07278}, + year = {2017}, +} + +@techreport{ietf-cross-logging, + author = {Benjamin Hof}, + title = {{STH} Cross Logging}, + institution = {IETF}, + number = {draft-hof-trans-cross-00}, + type = {Internet-draft}, + year = {2017}, +} + +@misc{google-gossip, + author = {Ryan Sleevi and Eran Messeri}, + title = {{Certificate Transparency} in {Chrome}: Monitoring {CT} Logs consistency}, + howpublished = {\url{https://docs.google.com/document/d/1FP5J5Sfsg0OR9P4YT0q1dM02iavhi8ix1mZlZe_z-ls/edit?pref=2&pli=1}, accessed 2019-09-04}, +} + +@inproceedings{cosi, + author = {Ewa Syta and + Iulia Tamas and + Dylan 
Visher and + David Isaac Wolinsky and + Philipp Jovanovic and + Linus Gasser and + Nicolas Gailly and + Ismail Khoffi and + Bryan Ford}, + title = {Keeping Authorities ``Honest or Bust'' with Decentralized Witness Cosigning}, + booktitle = {IEEE S\&P}, + year = {2016}, +} + +@inproceedings{mpaudit, + author = {Lachlan J. Gunn and Andrew Allison and Derek Abbott}, + title = {Safety in Numbers: Anonymization Makes Keyservers Trustworthy}, + booktitle = {HotPETs}, + year = {2017}, +} + +@inproceedings{doublecheck, + author = {Mansoor Alicherry and Angelos D. Keromytis}, + title = {{DoubleCheck}: Multi-path Verification Against Man-in-the-Middle + Attacks}, + booktitle = {ISCC}, + year = {2009}, +} + +@inproceedings{perspectives, + author = {Dan Wendlandt and David G. Andersen and Adrian Perrig}, + title = {Perspectives: Improving {SSH}-Style Host Authentication with + Multi-Path Probing}, + booktitle = {USENIX ATC}, + year = {2008}, +} + +@misc{ct-over-dns, + author = {Ben Laurie}, + title = {{Certificate Transparency} Over {DNS}}, + howpublished = {\url{https://github.com/google/certificate-transparency-rfcs/blob/master/dns}, accessed 2019-09-04}, +} + +%%% Results: Characteristics of fragment traffic (§4c) +% Figures 12--13 show real-world measurements that __very__ few correct +% fragmentation series are less than 576 bytes. +% +% 93 series less than 256 bytes were observed, and all but two appeared to be +% errors. +% +% ``MTUs lower than 576 bytes are generally evidence of mistaken or misguided +% configuration''. +%%% +@article{frag-study-02, + author = {Colleen Shannon and David Moore and Kimberly C. 
Claffy}, + title = {Beyond Folklore: Observations on Fragmented Traffic}, + journal = {IEEE/ACM Trans.\ Netw.}, + volume = {10}, + number = {6}, + year = {2002}, +} + +%%% page 60 +% ``Since nearly all networks in the Internet currently support an MTU of 576 +% or greater, we strongly recommend the use of 576 for datagrams sent to +% non-local networks'' +%%% +@techreport{min-mtu, + author = {Robert Braden}, + title = {Requirements for {Internet} hosts---Communication Layers}, + institution = {IETF}, + type = {RFC}, + number = {1122}, + year = {1989}, +} + +@inproceedings{xdp, + author = {Toke H{\o}iland{-}J{\o}rgensen and + Jesper Dangaard Brouer and + Daniel Borkmann and + John Fastabend and + Tom Herbert and + David Ahern and + David Miller}, + title = {The eXpress data path: fast programmable packet processing in the operating system kernel}, + booktitle = {CoNEXT}, + year = {2018}, +} + +@inproceedings{TCPoss, + author = {Michio Honda and + Yoshifumi Nishida and + Costin Raiciu and + Adam Greenhalgh and + Mark Handley and + Hideyuki Tokuda}, + title = {Is it Still Possible to Extend {TCP}?}, + booktitle = {IMC}, + year = {2011}, +} + +@inproceedings{HTTPSintercept, + author = {Zakir Durumeric and + Zane Ma and + Drew Springall and + Richard Barnes and + Nick Sullivan and + Elie Bursztein and + Michael Bailey and + J. Alex Halderman and + Vern Paxson}, + title = {The Security Impact of {HTTPS} Interception}, + booktitle = {NDSS}, + year = {2017}, +} + +@inproceedings{mpquic, + author = {Quentin De Coninck and Olivier Bonaventure}, + title = {Multipath {QUIC}: Design and Evaluation}, + booktitle = {CoNEXT}, + year = {2017}, +} + +@inproceedings{tor, + author = {Roger Dingledine and Nick Mathewson and Paul F. 
Syverson}, + title = {Tor: The Second-Generation Onion Router}, + booktitle = {USENIX Security}, + year = {2004}, +} + +@inproceedings{androidlibs, + author = {Michael Backes and Sven Bugiel and Erik Derr}, + title = {Reliable Third-Party Library Detection in {Android} and its Security Applications}, + booktitle = {{CCS}}, + year = {2016}, +} + +@inproceedings{androidlibs2, + author = {Erik Derr and + Sven Bugiel and + Sascha Fahl and + Yasemin Acar and + Michael Backes}, + title = {Keep me Updated: An Empirical Study of Third-Party Library + Updatability on {Android}}, + booktitle = {CCS}, + year = {2017}, +} + +@inproceedings{langely-quic, + author = {Adam Langley and + Alistair Riddoch and + Alyssa Wilk and + Antonio Vicente and + Charles Krasic and + Dan Zhang and + Fan Yang and + Fedor Kouranov and + Ian Swett and + Janardhan R. Iyengar and + Jeff Bailey and + Jeremy Dorfman and + Jim Roskind and + Joanna Kulik and + Patrik Westin and + Raman Tenneti and + Robbie Shade and + Ryan Hamilton and + Victor Vasiliev and + Wan{-}Teh Chang and + Zhongyi Shi}, + title = {The {QUIC} Transport Protocol: Design and Internet-Scale Deployment}, + booktitle = {{SIGCOMM}}, + year = {2017}, +} + + +@article{sdn, + author = {Nick Feamster and + Jennifer Rexford and + Ellen W. 
Zegura}, + title = {The road to {SDN:} An intellectual history of programmable networks}, + journal = {CCR}, + volume = {44}, + number = {2}, + year = {2014}, +} + +@techreport{ipv6, + author = {Steve Deering and Robert Hinden}, + title = {Internet Protocol Version 6 ({IPv6}) specification}, + type = {RFC}, + institution = {IETF}, + number = {8200}, + year = {2017}, +} + +@inproceedings{ct-formal, + author = {Benjamin Dowling and + Felix G{\"{u}}nther and + Udyani Herath and + Douglas Stebila}, + title = {Secure Logging Schemes and {Certificate Transparency}}, + booktitle = {ESORICS}, + year = {2016}, +} + +@inproceedings{bpf, + author = {Steven McCanne and Van Jacobson}, + title = {The {BSD} Packet Filter: A New Architecture for User-level + Packet Capture}, + booktitle = {Usenix Winter Technical Conference}, + year = {1993}, +} + +@inproceedings{pisces, + author = {Muhammad Shahbaz and + Sean Choi and + Ben Pfaff and + Changhoon Kim and + Nick Feamster and + Nick McKeown and + Jennifer Rexford}, + title = {{PISCES:} A Programmable, Protocol-Independent Software Switch}, + booktitle = {{ACM} {SIGCOMM}}, + year = {2016}, +} + +@article{p4, + author = {Pat Bosshart and + Dan Daly and + Glen Gibb and + Martin Izzard and + Nick McKeown and + Jennifer Rexford and + Cole Schlesinger and + Dan Talayco and + Amin Vahdat and + George Varghese and + David Walker}, + title = {{P4}: Programming Protocol-independent Packet Processors}, + journal = {CCR}, + volume = {44}, + number = {3}, + year = {2014}, +} + +@inproceedings{rmt, + author = {Pat Bosshart and + Glen Gibb and + Hun{-}Seok Kim and + George Varghese and + Nick McKeown and + Martin Izzard and + Fernando A. 
Mujica and + Mark Horowitz}, + title = {Forwarding Metamorphosis: Fast Programmable Match-action Processing in + Hardware for {SDN}}, + booktitle = {ACM SIGCOMM}, + year = {2013}, +} + +@conference{p4netfpga, + author = {Gordon Brebner}, + title = {{P4} for an {FPGA} Target}, + booktitle = {P4 Workshop}, + year = {2015}, + note = {\url{https://p4workshop2015.sched.com/event/3ZQA/p4-for-an-fpga-target}, accessed 2019-09-04}, +} + +@misc{p4netronome, + title = {Programming {NFP} with {P4} and {C}}, + howpublished = {\url{https://www.netronome.com/media/redactor_files/WP_Programming_with_P4_and_C.pdf}, accessed 2019-09-04}, +} + +@misc{flexpipe, + title = {Intel Ethernet Switch {FM600} Series: 10/40 {GbE} Low Latency Switching Silicon}, + howpublished = {\url{https://www.intel.com/content/dam/www/public/us/en/documents/product-briefs/ethernet-switch-fm6000-series-brief.pdf}, accessed 2019-09-04}, +} + +@misc{cavium, + title = {Cavium and {XPliant} Introduce a Fully Programmable Switch Silicon Family Scaling to 3.2 Terabits per Second}, + howpublished = {\url{https://cavium.com/newsevents-cavium-and-xpliant-introduce-a-fully-programmable-switch-silicon-family.html}, accessed 2019-09-04}, +} + +@misc{barefoot, + title = {Tofino: World's fastest {P4}-programmable Ethernet switch {ASICs}}, + howpublished = {\url{https://barefootnetworks.com/products/brief-tofino/}, accessed 2019-09-04}, +} + +@misc{p4bm, + title = {BEHAVIORAL MODEL REPOSITORY}, + howpublished = {\url{https://github.com/p4lang/behavioral-model}, accessed 2019-09-04}, +} + +@misc{p42ebpf, + author = {Mihai Budiu}, + title = {Compiling {P4} to {eBPF}}, + howpublished = {\url{https://github.com/iovisor/bcc/tree/master/src/cc/frontends/p4}, accessed 2019-09-04}, +} + +@misc{fbi-apple, + author = {EFF}, + title = {Apple Challenges {FBI}: All Writs Act Order ({CA})}, + howpublished = {\url{https://www.eff.org/cases/apple-challenges-fbi-all-writs-act-order}, accessed 2019-09-04}, +} + +@misc{ver-sth, + author = 
{Linus Nordberg}, + title = {{Re: [Trans] Providing} the history of {STHs} a log has issued (in 6962-bis)}, + howpublished = {\url{https://mailarchive.ietf.org/arch/msg/trans/JbFiwO90PjcYzXrEgh-Y7bFG5Fw}, accessed 2019-09-04}, +} + +@techreport{dot, + author = {Sara Dickinson and Dan Gillmor and Tirumaleswar Reddy}, + title = {Usage Profiles for {DNS} over {TLS} and {DNS} over {DTLS}}, + type = {RFC}, + institution = {IETF}, + number = {8310}, + year = {2016}, +} + +@techreport{doh, + author = {Paul Hoffman and Patrick McManus}, + title = {{DNS} Queries over {HTTPS (DoH)}}, + type = {RFC}, + institution = {IETF}, + number = {8484}, + year = {2018}, +} + +@misc{izenpe, + author = {Ryan Sleevi}, + title = {Upcoming {CT} Log Removal: {Izenpe}}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/qOorKuhL1vA}, accessed 2019-09-04}, +} + +@misc{venafi, + author = {Ryan Sleevi}, + title = {Upcoming Log Removal: {Venafi CT} Log Server}, + note = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/KMAcNT3asTQ}, accessed 2019-09-04}, +} + +@misc{caida, + author = {CAIDA}, + title = {{ARank}}, + note = {\url{http://as-rank.caida.org/}, accessed 2019-09-04}, +} + +@article{full-version, + author = {Rasmus Dahlberg and + Tobias Pulls and + Jonathan Vestin and + Toke H{\o}iland{-}J{\o}rgensen and + Andreas Kassler}, + title = {Aggregation-Based Gossip for Certificate Transparency}, + journal = {CoRR}, + volume = {abs/1806.08817}, + year = {2019}, +} + +@misc{pub-caida, + author = {CAIDA}, + title = {The {CAIDA UCSD IXPs} Dataset}, + howpublished = {\url{https://www.caida.org/data/ixps/}, accessed 2019-09-04}, + month = {February}, + year = {2018}, +} + +@misc{pub-routeviews, + title = {The {Routeviews MRT format RIBs and UPDATEs} Dataset}, + howpublished = {\url{http://archive.routeviews.org/bgpdata/2018.03/RIBS/}, accessed 2019-09-04}, + month = {March}, + year = {2018}, +} + +@misc{github, + title = {Paper artifact}, + 
year = {2018}, + howpublished = {\url{https://github.com/rgdd/ctga}}, +} diff --git a/summary/src/ctga/src/related.tex b/summary/src/ctga/src/related.tex new file mode 100644 index 0000000..15d7fad --- /dev/null +++ b/summary/src/ctga/src/related.tex @@ -0,0 +1,67 @@ +\section{Related Work} \label{ctga:sec:related} +Earlier approaches towards CT gossip are categorized as \emph{proactive} or +\emph{retroactive} in Figure~\ref{ctga:fig:related}. We consider an approach proactive +if gossip takes place \emph{before} SCTs and/or STHs reach the broader audience +of clients. +Syta \emph{et~al.} proposed proactive witness cosigning, in which an STH is +collectively signed by a \emph{large} number of witnesses and at most a fraction +of those can be faulty to ensure that a benevolent witness observed an +STH~\cite{cosi}. STH +cross-logging~\cite{minimal-gossip,ietf-cross-logging,hof-cross-logging} +is similar in that an STH must be proactively disclosed in +another transparency log to be trusted, avoiding any additional cosigning +infrastructure at the cost of reducing the size and diversity of the witnessing +group. +Tomescu and Devadas~\cite{catena} suggested a similar cross-logging scheme, +but split-view detection is instead reduced to the difficulty of forking the +Bitcoin blockchain + (big-O cost of downloading all block headers as a TLS client). +The final proactive approach is STH pushing, where a trusted third-party +pushes the same verified STH history to a base of clients~\cite{google-gossip}. +\begin{figure}[!t] + \centering + \input{src/ctga/img/related.tex} + \caption{% + A categorization of approaches towards CT gossip. + } + \label{ctga:fig:related} +\end{figure} + +We consider a gossip mechanism retroactive if gossip takes place \emph{after} +SCTs and/or STHs reach the broader audience of clients. +Chuat \emph{et~al.} proposed that TLS clients and TLS servers be modified to +pool exchanged STHs and relevant consistency proofs~\cite{chuat-gossip}. 
+Nordberg \emph{et~al.} continued this line of work, suggesting privacy-% +preserving client-server pollination of fresh STHs~\cite{ietf-gossip}. Nordberg +\emph{et~al.} also proposed that clients feed back SCTs and certificate chains on +every server revisit, and that trusted auditor relationships could be engaged if +privacy need not be protected. +The latter is somewhat similar to the formalized client-monitor gossip of +Chase and Meiklejohn~\cite{transparency-overlays}, as well as the CT honey bee +project where a client process fetches and submits STHs to a pre-% +compiled list of auditors~\cite{ct-honey-bee}. +Laurie suggested that a client can resolve privacy-sensitive SCTs to privacy-% +insensitive STHs via DNS (which are easier to gossip)~\cite{ct-over-dns}. +Private information retrievals could likely achieve something similar~\cite{ct-pir}. +Assuming that TLS clients are indistinguishable from one another, split-view +detection could also be implicit as proposed by Gunn \emph{et~al.} for the +verifiable key-value store CONIKS~\cite{mpaudit,coniks}. + +Given that aggregation-based gossip takes place after an STH is issued, it is a +retroactive approach. As such, we cannot protect an isolated client from split-% +views~\cite{cosi}. Similar to STH pooling and STH pollination, we rely on +client-driven communication and an existing infrastructure of packet processors +to aggregate. +Our off-path verification is +based on the same multi-path probing and indistinguishability assumptions as +Gunn \emph{et~al.}~\cite{doublecheck,mpaudit,perspectives}. Further, given that +aggregation is application neutral and deployable on hosts, it could provide +gossip \emph{for} the CT honey bee project (assuming plaintext STHs) and any +other transparency application like Trillian~\cite{vds}. 
Another benefit when +compared to browsing-centric and vendor-specific approaches is that a plethora +of HTTPS clients are covered, ranging from niche web browsers to command line +tools and embedded libraries that are vital to protect but yet lack the +resources of major browser vendors~\cite{androidlibs,androidlibs2}. +Our approach coexists well with witness cosigning and cross-logging due to +different threat models, but not necessarily STH pushing if the secure +channel is encrypted (no need to fetch what a trusted party provides). diff --git a/summary/src/ctor/.gitignore b/summary/src/ctor/.gitignore new file mode 100644 index 0000000..8bb88c8 --- /dev/null +++ b/summary/src/ctor/.gitignore @@ -0,0 +1,9 @@ +main.pdf +*.blg +*.bbl +*.fls +*.fdb_latexmk +*.log +*.out +*.aux +*.swp diff --git a/summary/src/ctor/img/design-full.pdf b/summary/src/ctor/img/design-full.pdf new file mode 100644 index 0000000..5602116 Binary files /dev/null and b/summary/src/ctor/img/design-full.pdf differ diff --git a/summary/src/ctor/img/design-incremental.pdf b/summary/src/ctor/img/design-incremental.pdf new file mode 100644 index 0000000..7c7160d Binary files /dev/null and b/summary/src/ctor/img/design-incremental.pdf differ diff --git a/summary/src/ctor/main.tex b/summary/src/ctor/main.tex new file mode 100644 index 0000000..ac4b505 --- /dev/null +++ b/summary/src/ctor/main.tex @@ -0,0 +1,72 @@ +\begin{kaupaper}[ + author={% + \textbf{Rasmus Dahlberg}, + Tobias Pulls, + Tom Ritter, and + Paul Syverson + }, + title={% + Privacy-Preserving \& Incrementally-Deployable Support for Certificate Transparency in Tor + }, + reference={% + PETS (2021) + }, + summary={% + One deployment challenge of Certificate Transparency is to ensure that + monitors and end-users are engaged in gossip-audit protocols. This is + particularly difficult for end-users because such engagement can harm + privacy. 
For example, verifying that a certificate is included by + fetching an inclusion proof from a log reveals which website was visited. + We propose a gradual roll-out of Certificate Transparency in Tor Browser + that preserves privacy \emph{due to} and \emph{how we use} the anonymity + network Tor. The complete design holds log operators accountable for + certificates they promise to append by having Tor relays fetch inclusion + proofs against the same view agreed upon by directory authorities in Tor's + consensus. Found issues (if any) are reported to trusted auditors. The + incremental design side-steps much of the practical deployment effort by + replacing the audit-report pattern with cross-logging of certificates in + independent logs, thus assuming that at least one log is honest as opposed + to no log in the complete design. All Tor Browser needs to do is verify + log signatures and then submit the encountered certificates to randomly + selected Tor relays. Such submissions are probabilistic to balance + performance against the risk of eventual detection of log misbehavior. + Processing of the submitted certificates is also randomized to reduce + leakage of real-time browsing patterns, something Tor Browser cannot do on + its own due to criteria like disk avoidance and the threat model for + wanting Certificate Transparency in the first place. We provide a + security sketch and estimate performance overhead based on Internet + measurements. + }, + participation={\vspace{-.25cm} + I had the initial idea and was the main driver to move the work forward, + first in discussion with Tobias and then together with Tom and Paul. 
+ }, + label={ + paper:ctor + }, +] + \maketitle + \begin{abstract} + \input{src/ctor/src/abstract} + \end{abstract} + + \input{src/ctor/src/introduction} + \input{src/ctor/src/background} + \input{src/ctor/src/adversary} + \input{src/ctor/src/design} + \input{src/ctor/src/analysis} + \input{src/ctor/src/cross-logging} + \input{src/ctor/src/performance} + \input{src/ctor/src/privacy} + \input{src/ctor/src/related} + \input{src/ctor/src/conclusion} + + \input{src/ctor/src/acknowledgements} + + \bibliographystyle{plain} + \bibliography{src/ctor/src/ref} + + \begin{appendices} + \input{src/ctor/src/appendix} + \end{appendices} +\end{kaupaper} diff --git a/summary/src/ctor/src/abstract.tex b/summary/src/ctor/src/abstract.tex new file mode 100644 index 0000000..718c939 --- /dev/null +++ b/summary/src/ctor/src/abstract.tex @@ -0,0 +1,30 @@ +\noindent +The security of the web improved greatly throughout the last couple of years. +A large majority of the web is now served encrypted as part of HTTPS, and +web browsers accordingly moved from positive to negative security indicators +that warn the user if a connection is insecure. A secure connection requires +that the server presents a valid certificate that binds the domain name in +question to a public key. A certificate used to be valid if signed by a trusted +Certificate Authority (CA), but web browsers like Google Chrome and +Apple's Safari have additionally started to mandate Certificate Transparency (CT) +logging to overcome the weakest-link security of the CA ecosystem. Tor and the +Firefox-based Tor Browser have yet to enforce CT. + +We present privacy-preserving and incrementally-deployable +designs that add support for CT in Tor. Our designs go beyond the currently +deployed CT enforcements that are based on blind trust: + if a user that uses Tor Browser is man-in-the-middled over HTTPS, + we probabilistically detect and disclose cryptographic evidence of CA and/or + CT log misbehavior. 
+The first design increment allows Tor to play a vital role in the overall goal +of CT: + detect mis-issued certificates and hold CAs accountable. +We achieve this by randomly cross-logging a subset of certificates into other CT +logs. The final increments hold misbehaving CT logs accountable, initially +assuming that some logs are benign and then without any such assumption. +Given that the current CT deployment lacks strong mechanisms to verify if log +operators play by the rules, exposing misbehavior is important for the web in +general and not just Tor. The full design turns Tor into a system for +maintaining a probabilistically-verified view of the CT log ecosystem available +from Tor's consensus. Each increment leading up to it preserves privacy due to +and how we use Tor. diff --git a/summary/src/ctor/src/acknowledgements.tex b/summary/src/ctor/src/acknowledgements.tex new file mode 100644 index 0000000..3bd9f48 --- /dev/null +++ b/summary/src/ctor/src/acknowledgements.tex @@ -0,0 +1,7 @@ +\section*{Acknowledgements} +We would like to thank our anonymous reviewers as well as Linus Nordberg and +Eric Rescorla for their valuable feedback. +Rasmus Dahlberg was supported by the Knowledge Foundation of Sweden and the +Swedish Foundation for Strategic Research, +Tobias Pulls by the Swedish Internet Foundation, and +Paul Syverson by the U.S.\ Office of Naval Research (ONR). diff --git a/summary/src/ctor/src/adversary.tex b/summary/src/ctor/src/adversary.tex new file mode 100644 index 0000000..a17fd31 --- /dev/null +++ b/summary/src/ctor/src/adversary.tex @@ -0,0 +1,76 @@ +\section{Threat Model} \label{ctor:sec:adversary} +We consider a strong attacker who is targeting all or a subset of users visiting +a particular website over Tor. It is generally difficult to perform a targeted +attack on a single particular Tor user because one needs to identify the user's +connection before performing the attack---something that Tor's +anonymity properties frustrate. 
+However, it is not difficult to perform an attack on all or a subset of unknown +users of a particular service. A network vantage point to perform such an attack +is easily obtained by operating an exit relay (for a subset of Tor users) or by +compromising the network path of multiple exit relays or the final destination. +Once so positioned, the encrypted network traffic can be intercepted using a +fraudulent certificate and associated SCTs. The subsequent attack on decrypted +network traffic +may be passive (to gather user credentials or other information) or active. +Typical examples of active attacks are to change cryptocurrency addresses to +redirect funds to the attacker or to serve an exploit to the user's browser for +\emph{user deanonymization}. Without the ability to intercept encrypted traffic, +these attacks become more difficult as the web moves towards deprecating +plaintext HTTP. + +All of the components of such an attack have been seen in-the-wild +numerous times. Untargeted attacks on visitors of a particular website +include Syria's interception of Facebook traffic using a self-signed +512-bit RSA key in~2011~\cite{syria-facebook-mitm}, Iran's +interception of Bing and Google traffic using the DigiNotar +CA~\cite{ct/a,diginotar}, and the 2018 MyEtherWallet +self-signed certificate that was used as part of a BGP +hijack~\cite{ethereum-hijack-isoc}. The latter is also an example of +redirecting routing as part of an attack (either suspected or +confirmed). Other examples of this are Iran hijacking prefixes of +Telegram (an encrypted messaging application) in +2018~\cite{iran-telegram-bgp}, another attack on cryptocurrency in +2014 this time targeting unencrypted mining +traffic~\cite{bgp-hijacking-for-crypto}, +and hijacks that may have been intelligence-gathering (or honest +mistakes) including hijacks by Russian ISPs in 2017 and China Telecom +in 2018 and 2019~\cite{wiki-bgp}. 
Finally, there are several examples of +law enforcement serving exploits to Tor Browser users to de-anonymize and +subsequently arrest individuals~\cite{forbes-fbi-tor,doj-fbi-tor}. + +With +the attacker's profile in mind, we consider someone that controls + a CA, + enough CT logs to pass Tor Browser's SCT-centric CT policy, + some Tor clients, and + a fraction of Tor relays. +For example, it is possible to + issue certificates and SCTs, + dishonor promises of public logging, + present split-views at will, + intercept and delay traffic from controlled exit relays as well as CT logs, + and + be partially present in the network. +This includes a weaker attacker that does not \emph{control} CAs and CT logs, +but who \emph{gained access} to the relevant signing keys~\cite{turktrust,% +gdca1-omission}. A modest fraction of CTor entities can be subject to DoS, but +not everyone at once and all the time. In other words, we consider the threat +model of Tor and Tor Browser as a starting point~\cite{tor,tor-browser}. Any +attacker that can reliably disrupt CT and/or Tor well beyond Tor's threat +model is therefore not within ours. + +Given that we are in the business of enforcing CT, the attacker needs to hide +mis-issued certificates and SCTs from entities that audit the CT log ecosystem. +As described in Section~\ref{ctor:sec:background:ct}, this can either be achieved by +omission or split-view attacks. Our intended attacker is clearly powerful and +may successfully issue a certificate chain and associated SCTs without detection +some of the time, but a CA caught in mis-issuance or a CT log that violated an +MMD promise will no longer be regarded as trusted. Therefore, we assume a +\emph{risk-averse} attacker that above a relatively low probability of detection +would be deterred from engaging in such activities. 
Note that the goal of +\emph{detection} is inherited from CT's threat model, which aims to remedy +certificate mis-issuance \emph{after the fact}; not prevent it~\cite{ct/a}. + +We identify and analyze specific attack vectors that follow from our threat +model and design as part of the security analysis in Section~\ref{ctor:sec:analysis}, +namely, attack vectors related to timing as well as relay flooding and tagging. diff --git a/summary/src/ctor/src/analysis.tex b/summary/src/ctor/src/analysis.tex new file mode 100644 index 0000000..4bbc4c3 --- /dev/null +++ b/summary/src/ctor/src/analysis.tex @@ -0,0 +1,173 @@ +\section{Security Analysis} \label{ctor:sec:analysis} +We consider four types of impact for an attacker that conducted +HTTPS-based man-in-the-middle attacks on Tor Browser. Other than \emph{none}, +these impact types are: +\begin{description} + \item[Minor] the attack was detected due to some cover-up that involved + network-wide actions against CTor. This is likely hard to attribute to + the actual attacker, but nevertheless it draws much unwanted attention. + \item[Significant] the attack generated public cryptographic evidence + that proves CA misbehavior. + \item[Catastrophic] the attack generated public cryptographic evidence + that proves CT log misbehavior. +\end{description} + +Our design leads to significant and catastrophic impact events, but does +unfortunately not preclude minor ones. It is possible to overcome this +shortcoming at different trade-offs, e.g., by tuning CTor parameters reactively +(phase~2 below) or relying on different trust assumptions as in the +incremental cross-logging designs (Section~\ref{ctor:sec:incremental}). + +\textbf{Probability of Detection.} +Suppose the attacker mis-issued a certificate that Tor Browser trusts, and that +it is considered valid because it is accompanied by enough SCTs from CT logs +that the attacker controls. 
The resulting SFO is then used to man-in-the-middle +a single Tor Browser user, i.e., for the purpose of our analysis we consider +\emph{the most risk-averse scenario possible}. Clearly, none of the attacker's +CT logs plan to keep any promise of public logging: + that would trivially imply significant impact events. +The risk of exposure is instead bound by the probability that \emph{any} of the +four phases in our design fail to propagate the mis-issued SFO to a pinned CT +auditor that is benign. + +\textbf{Phase~1: Submission.} +The probability of detection cannot exceed the probability of submission +(\texttt{ct-submit-pr}). We analyze the outcome of submitting the mis-issued +SFO from Tor Browser to a CTR\@. There are two cases to consider, namely, the +mis-issued SFO is either larger than \texttt{ct-large-sfo-size} or it is not. + +If the SFO is larger than \texttt{ct-large-sfo-size}, Tor Browser blocks until +the SFO is submitted and its CT circuit is closed. As such, it is impossible to +serve a Tor Browser exploit reactively over the man-in-the-middled connection +that shuts-down the submission procedure before it occurs. Assuming that +forensic traces in tor and Tor Browser are unreliable,\footnote{% + ``tor'' (aka ``little-t tor'') is the tor process Tor Browser uses to + interact with the Tor network. On marking a circuit as closed in tor, tor + immediately schedules the associated data structures to be freed as soon as + possible. +} the sampled CTR identity also cannot be revealed with high certainty +afterwards by compromising Tor Browser. The attacker may know that the SFO is +buffered by \emph{some CTR} based on timing, i.e., blocking-behavior could be +measurable and distinct. The important part is not to reveal \emph{which CTR} +received a submission: a single Tor relay may be subject to DoS. 
+ +If the SFO is smaller or equal to \texttt{ct-large-sfo-size} there is a +race between (i) the time it takes for Tor Browser to submit the SFO and close +its CT circuit against (ii) the time it takes for the attacker to compromise Tor +Browser and identify the CTR in question. It is more advantageous to try and +win this race rather than being in the unfruitful scenario above. Therefore, +the attacker would maximize the time it takes to perform (i) by sending an SFO +that is \texttt{ct-large-sfo-size}. Our design reduced the threat of an +attacker that wins this race by using pre-built CT circuits that are closed +immediately after use. This makes the attack surface \emph{narrow}, limiting +the number of reliable exploits (if any). + +Note that the attack surface could, in theory, be eliminated by setting +\texttt{ct-large-sfo-size} to zero. However, that is likely too costly in +terms of latency~\cite{no-hard-fail}. + +\textbf{Phase~2: Buffering.} +The probability of detection cannot exceed $1-(f_{\mathsf{ctr}} + +f_{\mathsf{dos}})$, where $f_{\mathsf{ctr}}$ is the fraction of +malicious CTRs and $f_{\mathsf{dos}}$ the fraction of CTRs that suffer from +DoS. We analyze the outcome of SFO reception at a genuine CTR\@. + +The time that an SFO is buffered depends on if the log's MMD elapsed or not. +The earliest point in time that a newly issued SCT can be audited (and the log +is expected to respond) is an MMD later, whereas the normal buffer time is +otherwise only governed by smaller randomness in the \texttt{audit\_after} +timestamp (minutes). A rational attacker would therefore maximize the buffer +time by using a newly issued SCT, resulting in an attack window that is \emph{at +least} 24~hours for today's CT logs~\cite{google-log-policy}. + +Following from Tor's threat model, the mis-issued SFO must be stored in volatile +memory and not to disk. 
Two risks emerge due to large buffer times: + the CTR in question might be restarted by the operator independently of the + attacker's mis-issued SFO being buffered, + and given enough time the attacker might find a way to cause the evidence to + be deleted. +While a risk-averse attacker cannot rely on the former to avoid detection, we +emphasize that the CTR criteria must include the \texttt{stable} flag to reduce +the probability of this occurring. + +The latter is more difficult to evaluate. It depends on the attacker's +knowledge as well as capabilities. Phase~1 ensured that the attacker \emph{does +not know which CTR to target}. As such, any attempt to intervene needs to +target all CTRs. While a network-wide DoS against Tor would be effective, it is +not within our threat model. A less intrusive type of DoS would be to +\emph{flood} CTRs by submitting massive amounts of SFOs: just enough to make +memory a scarce resource, but without making Tor unavailable. This could +potentially \emph{flush} a target SFO from the CTR's finite memory, following +from the delete-at-random strategy in Section~\ref{ctor:sec:base:phase2}. Assuming +that a CTR has at most 1~GiB of memory available for SFOs (conservative and in +favour of the attacker), Appendix~\ref{ctor:app:flush} shows that the attacker's +flood must involve at least $2.3$~GiB per CTR to accomplish a 90\% success +certainty. This means that it takes $7.9$--$39.3$~minutes if the relay +bandwidth is between 8--40~Mbps. So it is impractical to flush all CTRs within +a few minutes, and hours are needed not to make everyone unavailable at once. + +The CTR criteria set in Section~\ref{ctor:sec:base:consensus} matches over +4000 Tor relays~\cite{relay-by-flag}. A network-wide flush that succeeds with +90\% certainty therefore involves 8.99~TiB. It might sound daunting at first, +but distributed throughout an entire day it only requires 0.91~Gbps. 
Such an +attack is within our threat model because it does not make Tor unavailable. +Notably, the ballpark of these numbers does not change to any significant degree by +assuming larger success probabilities, e.g., a 99\% probability only doubles the +overhead. Further, the needed bandwidth scales linearly with the assumed memory +of CTRs. This makes it difficult to rely on the finite volatile memory of CTRs +to mitigate network-wide flushes. As described in +Section~\ref{ctor:sec:base:phase2}, we ensure that flushes are \emph{detected} by +publishing the number of received and deleted SFO bytes throughout different +time intervals as extra-info. + +Once detected, there are several possible \emph{reactions} that decrease the +likelihood of a minor impact scenario. For example, Tor's directory +authorities could lower MMDs to, say, 30~minutes, so that the SFO is reported to +an auditor before it is flushed with high probability. This has the benefit of +implying significant impact because the mis-issued certificate is detected, but +also the drawback of allowing the logs to merge the certificate before there is +any MMD violation to speak of. The most appropriate response depends on the +exact attack scenario and which trade-offs one is willing to accept. + +\textbf{Phase~3: Auditing.} +By the time an SFO enters the audit phase, the log in question is expected to +respond with a valid inclusion proof. There is no such proof if the log +violated its MMD, and it is too late to create a split-view that merged the +certificate in time because the CTR's view is already fixed by an STH in the +Tor consensus that captured the log's misbehavior. In fact, creating any +split-view within Tor is impractical because it requires that the consensus is +forged or that nobody ever checks whether the trusted STHs are consistent. +This leaves two options: + the attacker either responds to the query with an invalid inclusion proof or + not at all. 
+The former is immediately detected and starts phase~4, whereas the latter forces +the CTR to wait for \texttt{ct-watchdog-timeout} to trigger (which is a +few seconds to avoid premature auditor reports). A rational attacker prefers +the second option to gain time. + +Clearly, the attacker knows that \emph{some} CTR holds evidence of log +misbehavior as it is being audited. The relevant question is whether the +\emph{exact CTR identity} can be inferred, in which case the attacker could +knock it offline (DoS). Motivated by the threat of \emph{tagging}, where the +attacker sends unique SFOs to all CTRs so that their identities are revealed +once queried for, we erred on the safe side and built watchdogs into our design: +it is already too late to DoS the querying CTR because the evidence is already +replicated somewhere else, ready to be reported unless there is a timely +acknowledgement. The attacker would have to \emph{break into an arbitrary CTR +within seconds} to cancel the watchdog, which cannot be identified later on +(same premise as the sampled CTR in phase~1). Such an attacker is not in Tor's +threat model. + +\textbf{Phase~4: Reporting.} +At this stage the process of reporting the mis-issued SFO to a random CT auditor +is initiated. Clearly, the probability of detection cannot exceed +$1-f_{\mathsf{auditor}}$, where $f_{\mathsf{auditor}}$ is the fraction of +malicious CT auditors. Fixating the sampled CT auditor is important to avoid +the threat of an eventually successful report only if it is destined to the +attacker's auditor because our attacker is partially present in the network. +Gaining time at this stage is of limited help because the CTR identity is +unknown as noted above, and it remains the +case throughout phase~4 due to reporting on independent Tor circuits (and +independently of if other SFO reports succeeded or not). 
Without an +identifiable watchdog, the attacker needs a network-wide attack that is already +more likely to succeed in the buffer phase. diff --git a/summary/src/ctor/src/appendix.tex b/summary/src/ctor/src/appendix.tex new file mode 100644 index 0000000..23e285f --- /dev/null +++ b/summary/src/ctor/src/appendix.tex @@ -0,0 +1,117 @@ +\section{Detailed Consensus Parameters} \label{ctor:app:consensus-params} + +Below, the value of an item is computed as the median of all votes. +\begin{description} + \item[ct-submit-pr:] A floating-point in $[0,1]$ that determines Tor + Browser's submission probability. For example, $0$ disables submissions + while $0.10$ means that every 10$^{\mathsf{th}}$ SFO is sent to a random + CTR on average. + \item[ct-large-sfo-size:] A natural number that determines how many + wire-bytes a normal SFO should not exceed. As outlined in + Section~\ref{ctor:sec:base:phase1}, excessively large SFOs are subject to + stricter verification criteria. + \item[ct-log-timeout:] A natural number that determines how long a CTR waits + before concluding that a CT log is unresponsive, e.g., 5~seconds. As + outlined in Section~\ref{ctor:sec:base:phase3}, a timeout causes the watchdog + to send an SFO to the auditor. + \item[ct-delay-dist:] A distribution that determines how long a CTR should + wait at minimum before auditing a submitted SFO. As outlined in + Section~\ref{ctor:sec:base:phase2}, random noise is added, e.g., on the order + of minutes to an hour. + \item[ct-backoff-dist:] + A distribution that determines how long a CTR should wait between two + auditing instances, e.g., a few minutes on average. As outlined in + Section~\ref{ctor:sec:base:phase3}, CTRs audit pending SFOs in batches at + random time intervals to spread out log overhead. + \item[ct-watchdog-timeout:] A natural number that determines how long time + at most a watchdog waits before considering an SFO for reporting. 
Prevents + the watchdog from having to wait for a circuit timeout caused by an + unresponsive CTR. Should be set with \texttt{ct-backoff-dist} in mind. + \item[ct-auditor-timeout:] A natural number that determines how long at + most a watchdog waits for an auditor to acknowledge the submission of an SFO. +\end{description} + +\section{Log Operators \& Trust Anchors} \label{ctor:app:ct-trust-anchors} +The standardized CT protocol suggests that a log's trust anchors should +``usefully be the union of root certificates trusted by major browser +vendors''~\cite{ct,ct/bis}. Apple further claims that a log in their CT program +``must trust all root CA certificates included in Apple's trust +store''~\cite{apple-log-policy}. This bodes well for the incremental CTor +designs: + we assumed that the existence of independent log operators implies the + ability to at least add certificate chains and possibly complete SFOs + into logs that the attacker does not control. +Google's CT policy currently qualifies 36 logs that are hosted by + Cloudflare, + DigiCert, + Google, + Let's Encrypt, + Sectigo, and + TrustAsia~\cite{google-log-policy}. +No log accepts all roots, but the overlap between root certificates that are +trusted by major browser vendors and CT logs increased over +time~\cite{ct-root-landscape}. This trend would likely continue if there are +user agents that benefit from it, e.g., Tor Browser. Despite relatively few +log operators and an incomplete root coverage, the basic and extended +cross-logging in CTor still provide significant value as is: +\begin{itemize} + \item Even if there are no independent logs available for a certificate + issued by some CA, adding it again \emph{to the same logs} would come + with practical security gains. For example, if the attacker gained + access to the secret signing keys but not the logs' infrastructures + the mis-issued certificate trivially makes it into the public. 
If the + full SFO is added, the log operators could also notice that they were + compromised. + \item Most log operators only exclude a small fraction of widely accepted + root certificates: 1--5\%~\cite{ct-root-landscape}. This narrows down + the possible CAs that the attacker must control by 1--2 orders of + magnitude. In other words, to be entirely sure that CTor would (re)add + a mis-issued SFO to the attacker-controlled CT logs, this smaller group + of CAs must issue the underlying certificate. It is likely harder to + take control of Let's Encrypt which some logs and operators exclude due + to the sheer volume of issued certificates than, say, a smaller CA that + law enforcement may coerce. +\end{itemize} + +Browser-qualified or not, the availability of independent logs that accept the +commonly accepted root certificates provides significant ecosystem value. +Log misbehavior is mostly reported through the CT policy mailing list. Thus, it +requires manual intervention. Wide support of certificate chain and SCT +cross-logging allows anyone to \emph{casually} disclose suspected log +misbehavior on-the-fly. + +\section{Flushing a Single CTR} \label{ctor:app:flush} +Let $n$ be the number of SFOs that a CTR can store in its buffer. The +probability to sample a target SFO is thus $\frac{1}{n}$, and the probability to +not sample a target SFO is $q = 1 - \frac{1}{n}$. The probability to not sample +a target SFO after $k$ submissions is $q^k$. Thus, the probability to sample +the relevant buffer index at least once is $p = 1 - q^k$. Solving for $k$ we +get: $k = \frac{\log(1 - p)}{\log(q)}$. Substituting $q$ for $1 - \frac{1}{n}$ +yields Equation~\ref{ctor:eq:flush}, which can be used to compute the number of +SFO submissions that the attacker needs to flush a buffer of $n>2$ +entries with some probability~$p\in[0,1)$. 
+ +\begin{equation} \label{ctor:eq:flush} + k = \frac{\log(1-p)}{\log(1 - \frac{1}{n})} +\end{equation} + +It is recommended that a non-exit relay should have at least 512MB of memory. +If the available bandwidth exceeds 40Mbps, it should have at least +1GB~\cite{relay-config}. Given that these recommendations are lower bounds, +suppose the average memory available to store SFOs is 1GiB. +Section~\ref{ctor:sec:performance} further showed that the average SFO size is +roughly 6KiB. This means that the buffer capacity is $n \gets 174763$ SFOs. +Plugging it into Equation~\ref{ctor:eq:flush} for $p \gets \frac{9}{10}$, the +attacker's flood must involve $k \gets 402406$ submissions. In other words, +2.3GiB must be transmitted to flush a single CTR with 90\% success probability. + +As a corner case and implementation detail it is important that Tor Browser and +CTRs \emph{reject} SFOs that are bogus in terms of size: it is a trivial DoS +vector to load data indefinitely. If such a threshold is added the required +flushing bandwidth is still 2.3GiB (e.g., use 1MiB SFOs in the above +computations). What can be said about bandwidth and potential adversarial +advantages is that a submitted SFO yields amplification: + twofold for cross-logging, and + slightly more for proof-fetching as the SFO is pushed up-front to a + watchdog. +Note that such amplification is smaller than a typical website visit. diff --git a/summary/src/ctor/src/background.tex b/summary/src/ctor/src/background.tex new file mode 100644 index 0000000..85d972f --- /dev/null +++ b/summary/src/ctor/src/background.tex @@ -0,0 +1,150 @@ +\section{Background} \label{ctor:sec:background} +The theory and current practice of CT are introduced first, then Tor +and its privacy-preserving Tor Browser. 
+ +\subsection{Certificate Transparency} \label{ctor:sec:background:ct} +The idea to transparently log TLS certificates emerged at Google in response to +a lack of proposals that could be deployed without drastic ecosystem changes +and/or significant downsides~\cite{ct/a}. By making the set of issued +certificate chains\footnote{% + A domain owner's certificate is signed by an intermediate CA, whose + certificate is in turn signed by a root CA that acts as a trust + anchor~\cite{ca-ecosystem}. Such a \emph{certificate chain} is valid if it + ends in a trusted anchor that is shipped in the user's system software. +} transparent, anyone that inspects the logs can detect certificate +mis-issuance \emph{after the fact}. It would be somewhat circular to solve +issues in the CA ecosystem by adding trusted CT logs. Therefore, the +cryptographic foundation of CT is engineered to avoid any such reliance. +Google's \emph{gradual} CT roll-out started in 2015, and evolved from +downgrading user-interface indicators in Chrome to the current state of hard +failures unless a certificate is accompanied by a signed \emph{promise} that it +will appear in two CT logs~\cite{does-ct-break-the-web}. Unlike Apple's +Safari~\cite{apple-log-policy}, these two logs must additionally be operated by +Google and not-Google to ensure independence~\cite{google-log-policy}. + +The lack of mainstream verification, i.e., beyond checking signatures, allows an +attacker to side-step the current CT enforcement with minimal risk of exposure +\emph{if the required logs are controlled by the attacker}. +CTor integrates into the gradual CT roll-out by starting on the +premise of pairwise-independently trusted CT logs, which +avoids the risk of bad user experience~\cite{does-ct-break-the-web} +and significant system complexity. For example, web pages are unlikely to +break, TLS handshake latency stays about the same, and no robust management of +suspected log misbehavior is needed. 
Retaining the latter property as part of +our incremental designs simplifies deployment. + +\subsubsection{Cryptographic Foundation} +The operator of a CT log maintains a tamper-evident append-only Merkle +tree~\cite{ct,ct/bis}. At any time, a Signed Tree Head (STH) can be produced +which fixes the log's structure and content. Important attributes of an STH +include + the tree head (a cryptographic hash), + the tree size (a number of entries), and + the current time. +Given two tree sizes, a log can produce a \emph{consistency proof} that proves +the newer tree head entails everything that the older tree head does. As such, +anyone can verify that the log is append-only without downloading all entries +and recomputing the tree head. Membership of an entry can also be proven +by producing an \emph{inclusion proof} for an STH. These proof techniques are +formally verified~\cite{secure-logging-and-ct}. + +Upon a valid request, a log must add an entry and produce a new STH that covers +it within a time known as the Maximum Merge Delay (MMD), e.g., 24~hours. This +policy aspect can be verified because in response, a Signed Certificate +Timestamp (SCT) is returned. An SCT is a signed promise that an entry will +appear in the log within an MMD. A log that violates its MMD is said to perform +an \emph{omission attack}. It can be detected by challenging the log to prove +inclusion. A log that forks, presenting one append-only version +to some entities and another to others, is said to perform a \emph{split-view +attack}. Split-views can be detected by STH +gossip~\cite{chuat,dahlberg,nordberg,syta}. + +\subsubsection{Standardization and Verification} +The standardized CT protocol defines public HTTP(S) endpoints that allow anyone +to check the log's accepted trust anchors and added certificates, as well as +to obtain the most recent STH and to fetch proofs~\cite{ct,ct/bis}. 
For +example, the \texttt{add-chain} endpoint returns an SCT if the added certificate +chain ends in a trust anchor returned by the \texttt{get-roots} endpoint. We +use \texttt{add-chain} in Section~\ref{ctor:sec:incremental}, as well as several +other endpoints in Section~\ref{ctor:sec:base} to fetch proofs and STHs. It might be +helpful to know that an inclusion proof is fetched based on two parameters: a +certificate hash and the tree size of an STH. The former specifies the log entry +of interest, and the latter with regards to which view inclusion should be +proven. The returned proof is valid if it can be used in combination with the +certificate to reconstruct the STH's tree head. + +The CT landscape provides limited value unless it is verified that the logs +play by the rules. What the rules are has changed over time, but they are largely +influenced by the major browser vendors that define \emph{CT policies}. For +example, what is required to become a recognized CT log in terms of uptime and +trust anchors, and which criteria should pass to consider a certificate CT +compliant~\cite{apple-log-policy,google-log-policy}. While there are several ways that +a log can misbehave with regards to these policy aspects, the most fundamental +forms of cheating are omission and split-view attacks. A party that follows-up +on inclusion and consistency proofs is said to \emph{audit} the logs. + +Widespread client-side auditing is a premise for CT logs to be untrusted, but +none of the web browsers that enforce CT engage in such activities yet. For +example, requesting an inclusion proof is privacy-invasive because it leaks +browsing patterns to the logs, and reporting suspected log misbehavior comes +with privacy~\cite{ct-with-privacy} as well as operational challenges. +Found log incidents are mostly reported manually to the CT policy +list~\cite{ct-policy-mailing-list}. 
This is in contrast to automated +\emph{CT monitors}, which notify domain owners +of newly issued certificates based on what actually appeared in the public +logs~\cite{lwm,ct-monitors}. + +\subsection{Tor} \label{ctor:sec:background:tor} + +Most of the activity of Tor's millions of daily users starts with Tor Browser +and connects to some ordinary website via a circuit comprised of three +randomly-selected Tor relays. In this way no identifying information from +Internet protocols (such as IP address) is automatically provided to the +destination, and no single entity can observe both the source and destination of +a connection. Tor Browser is also configured and performs some filtering to resist +browser fingerprinting, and uses first party isolation to resist sharing state or +linking of identifiers across origins. More generally it avoids storing +identifying configuration and behavioral information to disk. + +Tor relays in a circuit are selected at random, but not uniformly. A typical +circuit is comprised of a \emph{guard}, a \emph{middle}, and an \emph{exit}. A +guard is selected by a client and used for several months as the entrance to all +Tor circuits. If the guard is not controlled by an adversary, that adversary +will not find itself selected to be on a Tor circuit adjacent to (thus +identifying) the client. And because some relay operators do not wish to act as +the apparent Internet source for connections to arbitrary destinations, relay +operators can configure the ports (if any) on which they will permit connections +besides to other Tor relays. Finally, to facilitate load balancing, relays are +assigned a weight based on their apparent capacity to carry traffic. In keeping +with avoiding storing of linkable state, even circuits that share an origin will +only permit new connections over that circuit for ten minutes. After that, if +all connections are closed, all state associated with the circuit is cleared. 
+ +Tor clients use this information when choosing relays with which to build a +circuit. They receive the information via an hourly updated \emph{consensus}. +The consensus assigns weights as well as flags such as \texttt{guard} or +\texttt{exit}. It also assigns auxiliary flags such as +\texttt{stable}, which, e.g., +is necessary to obtain the \texttt{guard} flag since guards must have good +availability. Self-reported information by relays in their \emph{extra-info +document}, such as statistics on their read and written bytes, are also part of +the consensus and uploaded to \emph{directory authorities}. Directory +authorities determine the consensus by voting on various components making up +the shared view of the state of the Tor network. Making sure that all clients +have a consistent view of the network prevents epistemic attacks wherein clients +can be separated based on the routes that are consistent with their +understanding~\cite{danezis:pets2008}. This is only a very rough sketch of Tor's +design and operation. More details can be found by following links at Tor's +documentation site~\cite{tor-documentation}. + +Tor does not aim to prevent end-to-end correlation attacks. An adversary +controlling the guard and exit, or controlling the destination and observing the +client ISP, etc., is assumed able to confirm who is connected to whom on that +particular circuit. The Tor threat model assumes an adversary able to control +and/or observe a small to moderate fraction of Tor relays measured by both +number of relays and by consensus weight, and it assumes a large +number of Tor clients +able to, for example, flood individual relays to detect traffic signatures of +honest traffic on a given circuit~\cite{long-paths}. Also, the adversary can +knock any small number of relays offline via either attacks from clients or +direct Internet DDoS. 
diff --git a/summary/src/ctor/src/conclusion.tex b/summary/src/ctor/src/conclusion.tex new file mode 100644 index 0000000..c7f5508 --- /dev/null +++ b/summary/src/ctor/src/conclusion.tex @@ -0,0 +1,49 @@ +\section{Conclusion} \label{ctor:sec:conclusion} +We proposed CTor, a privacy-preserving and incrementally-deployable design that +brings CT to Tor. Tor Browser should start by taking the same proactive +security measures as Google Chrome and Apple's Safari: + require that a certificate is only valid if accompanied by at least two + SCTs. +Such CT enforcement narrows down the attack surface from the weakest-link +security of the CA ecosystem to a relatively small number of trusted log +operators \emph{without negatively impacting the user experience to an +unacceptable degree}. The problem is that a powerful attacker may gain control +of the required logs, trivially circumventing enforcement without significant +risk of exposure. If deployed incrementally, CTor relaxes the currently +deployed trust assumption by distributing it across all CT logs. If the full +design is put into operation, such trust is completely eliminated. + +CTor repurposes Tor relays to ensure that today's trust in CT logs is not +misplaced: + Tor Browser probabilistically submits the encountered certificates and SCTs + to Tor relays, which + cross-log them into independent CT logs (incremental design) + or request inclusion proofs with regards to a single fixed view + (full design). +It turns out that delegating verification to a party that can defer it +is paramount in our setting, both for privacy and security. Tor and the wider +web would greatly benefit from each design increment. The full design turns Tor +into a +system for maintaining a probabilistically-verified view of the entire CT log +ecosystem, provided in Tor's consensus for anyone to use as a basis of trust. 
+The idea to cross-log certificates and SCTs further showcases how certificate +mis-issuance and suspected log misbehavior could be disclosed casually without +any manual intervention by using the log ecosystem against the attacker. + +The attacker's best bet to break CTor involves any of the following: + operating significant parts of the CTor infrastructure, + spending a reliable Tor Browser zero-day that escalates privileges within a + tiny time window, or + targeting all Tor relays in an attempt to delete any evidence of certificate + mis-issuance and log misbehavior. +The latter---a so-called network-wide flush---brings us to the border of our +threat model, but it cannot be ignored due to the powerful attacker that we +consider. Therefore, CTor is designed so that Tor can \emph{adapt} in response +to interference. For example, in Tor Browser the \texttt{ct-large-sfo-size} +could be set reactively such that all SFOs must be sent to a CTR before +accepting any HTTPS application-layer data to counter zero-days, and the submit +probability \texttt{ct-submit-pr} could be increased if ongoing attacks are +suspected. When it comes to the storage phase, the consensus can minimize or +maximize the storage time by tuning a log's MMD in the \texttt{ct-log-info} +item. The distribution that adds random buffering delays could also be updated, +as well as log operator relationships during the auditing phase. diff --git a/summary/src/ctor/src/cross-logging.tex b/summary/src/ctor/src/cross-logging.tex new file mode 100644 index 0000000..ec6807d --- /dev/null +++ b/summary/src/ctor/src/cross-logging.tex @@ -0,0 +1,101 @@ +\section{Incremental Deployment} \label{ctor:sec:incremental} +Section~\ref{ctor:sec:base} covered the full design that places zero-trust in the CT +landscape by challenging the logs to prove certificate inclusion with regards to +trusted STHs in the Tor consensus. 
If no such proof can be provided, the
suspected evidence of log misbehavior is reported to a trusted CT auditor that
follows up on the incident, which involves human intervention if an issue
persists.
This +has the benefit of holding the CA accountable if \emph{some} log operator is +benign. Given that our attacker is risk-averse, reporting to a single +independent log\footnote{The independent log need not be trusted by the browser, +i.e., it could be specified separately in the Tor consensus. An operator that +runs such a log would help distribute trust and facilitate auditing. +Appendix~\ref{ctor:app:ct-trust-anchors} provides details on today's log ecosystem.} +that issued none of the accompanied SCTs would likely be sufficient. There is +also room for further simplification: there is no point in challenging the logs +to prove inclusion if the fallback behavior of no response only makes the issued +certificate public, not the associated SCTs. Thus, CTRs could opt to cross-log +immediately \emph{without ever distinguishing between certificates that are +benign and possibly fraudulent}. This results in the incremental design shown +in Figure~\ref{ctor:fig:cross-log}, which initially removes several system +complexities such as extra-info metrics, auditor infrastructure, watchdog +collaborations, and inclusion proof fetching against trusted STHs in Tor's +consensus. + +The drawback of certificate cross-logging is that the misbehaving CT logs cannot +be exposed. There is also a discrepancy between cross-logging and encouraging +the CT landscape to deploy reliable CT auditors. We therefore suggest a +minimal change to the basic cross-logging design that addresses both of these +concerns. This change is unfortunately to the API of CT logs and not Tor. The +proposed change is to allow cross-logging of a certificate's issued SCTs, e.g., +in the form of an \texttt{add-sfo} API that would replace \texttt{add-chain} +in Figure~\ref{ctor:fig:cross-log}. +This means that CTRs could expose both the mis-issued certificate and the logs +that violated their promises of public logging. 
At the same time, the +infrastructural part of a CT auditor is built directly into existing +CT logs: + accepting SFOs that need further investigation. +Such an API would be an ecosystem improvement in itself, providing a +well-defined place to report suspected log misbehavior on-the-fly +\emph{casually}, i.e., without first trying to resolve an SFO for an extended +time period from many different vantage points and then ultimately reporting it +manually on the CT policy mailing list. + +\textbf{Security Sketch.} +There are no changes to phase~1 because cross-logging is instantiated at CTRs. +Phases~3--4 are now merged, such that the encountered certificates are added to +independent CT logs that the attacker does/may not control. Watchdogs are no +longer needed since either the certificates are added to a log that the attacker +controls, or they are not (which makes them public). The other main difference takes place in phase~2, +during which CTRs buffer SFOs. The buffer time used to be lengthy due to taking +early signals and MMDs into account, but it is now irrelevant as no inclusion +proofs are fetched. The expected buffer time can therefore be shortened down +to \emph{minutes} that follow only from the randomness in the +\texttt{audit\_after} timestamp (for the sake of privacy), making network-wide +flushes impractical while at the same time reducing the time that a mis-issued +certificate stays unnoticed: + a benign log is likely to add an entry before all MMDs elapsed. + +The extended cross-logging also aims to expose log misbehavior. As such, it is +paramount that no cross-logged SFO becomes public before the issuing CT logs can +merge the mis-issued certificate reactively to avoid catastrophic impact. This +could be assured by buffering newly issued SFOs longer as in the full design, +which brings back the threat and complexity of minor impact scenarios. 
Another +option that is appealing for Tor (but less so for CT) is to operate the +\texttt{add-sfo} API with the expectation of \emph{delayed merges} that account +for MMDs before making an SFO public, effectively moving lengthy buffering from +CTRs to CT logs with persistent storage. Trillian-based CT logs already support +delayed merges of (pre)certificates, see +\texttt{sequencer\_guard\_window}~\cite{delayed-merge}. diff --git a/summary/src/ctor/src/design.tex b/summary/src/ctor/src/design.tex new file mode 100644 index 0000000..5b887fe --- /dev/null +++ b/summary/src/ctor/src/design.tex @@ -0,0 +1,377 @@ +\section{Design} \label{ctor:sec:base} +A complete design---a design that detects misbehavior by both CAs and CT logs +within our strong threat model---requires a considerable degree of +complexity. In this section we present such a full design by breaking it up +into four phases as shown in Figure~\ref{ctor:fig:design}, demonstrating the need for +the involved complexity in each step. Section~\ref{ctor:sec:incremental} presents +two incremental versions of the full design that are less complicated. The +first increment comes as the cost of having a weaker threat model and security +goal. The second increment does not have a weaker security goal but requires a +new CT log API. + +A design that starts by validating SCT signatures like Apple's Safari is +promising and assumed~\cite{apple-log-policy,apple-on-independence}, but it does +not stand up against a malicious CA and two CT logs that work in concert. If +the logs cannot be trusted blindly, the presented SCTs need to be audited. + +\begin{figure}[!t] + \centering + \includegraphics[width=\columnwidth]{src/ctor/img/design-full} + \caption{% + An overview of the four phases of the full CTor design. In phase 1 Tor + Browser submits an SFO (SCT Feedback Object) to a Certificate Transparency + Relay (CTR), followed by phase 2 where the CTR buffers the SFO. 
In phase 3 + the relay attempts to audit the SFO, and in case of failure, it reports the + SFO to an auditor with the help of a watchdog CTR in phase 4.} + \label{ctor:fig:design} +\end{figure} + +\subsection{Phase~1: Submission} \label{ctor:sec:base:phase1} + +The least complicated auditing design would be one where Tor Browser receives a +TLS certificate and accompanying SCTs (we will refer to this bundle as an SCT +Feedback Object, or SFO for short) and talks to the corresponding logs, over +Tor, requesting an inclusion proof for each SCT. In an ordinary browser, this +would be an unacceptable privacy leak to the log of browsing behavior associated +with an IP address; performing this request over Tor hides the user's IP address +but still leaks real-time browsing behavior. + +An immediate problem with this design is that a primary requirement of Tor +Browser is to persist no data about browsing behavior after the application +exits. If we assume that browsers are not left running for long periods of time, +the inclusion proof request can be easily circumvented by the attacker by using +a fresh SCT whose MMD has not completed---thus no inclusion proof needs to be +provided (yet) by the log as per the CT standard. A second problem is that the +STH that an inclusion proof refers to exists in a \emph{trust vacuum}: + there is no way to know that it is consistent with other STHs and not part + of a split view (assuming that there is no proactive STH + gossip~\cite{dahlberg,syta}, which is not deployed). + +We can evolve the design by adding two components: a list of STHs that Tor +Browser receives over a trusted channel and the participation of a trusted third +party with the ability to persist data and perform auditing actions at a later +point in time. + +A single third party used by all users of Tor Browser would receive a +considerable aggregation of browsing behavior and would need to scale in-line +with the entire Tor network. 
A small number of auditors presents privacy and +single-point-of-failure concerns. A large number would be ideal but presents +difficulties in curation and independent management and still requires scaling +independent of the Tor network. These concerns do not entirely preclude the +design, but they can be easily avoided by reusing relays in the Tor network as +our trusted third parties: we call the relays so designated Certificate +Transparency Relays (CTRs). + +Now, when the browser is completing the TLS handshake, it simultaneously either +passes the SFO to a CTR (if the MMD of the SCT has not elapsed) or queries the +log itself for an inclusion proof to a trusted STH\@. However, if we presume +the attacker can serve an exploit to the browser, the latter behavior is +immediately vulnerable. The log, upon receiving an inclusion proof request for +an SCT that it knows is malicious, can delay its response. The TLS connection in +the browser, having succeeded, will progress to the HTTP request and response, +at which point the exploit will be served, and the SFO (containing the +cryptographic evidence of CA and log misbehavior) will be deleted by the exploit +code. While blocking the TLS connection until the CT log responds is an option, +experience related to OCSP hard-fail indicates that this notion is likely doomed +to fail~\cite{no-hard-fail}. + +The final change of the design has Tor Browser submit the SFO to the CTR +immediately upon receipt (with some probability) in all cases. A consequence of +this shift is that the trusted STH list no longer needs to be delivered to the +browser but rather the CTRs. To mitigate the risk of a browser exploit being +able to identify the CTR to the attacker (who could then target it), we prepare +\emph{CTR circuits} ahead of time that are closed and discarded as soon as the +SFO is sent. This allows the SFO submission to race with the TLS connection +completion and HTTP request/response. 
An added detail is to block the TLS +connection in the case that an SFO is unusually large, as defined by a parameter +\texttt{ct-large-sfo-size}. A large SFO may indicate an attempt to win the race +between SFO submission and exploitation. The parameter can be set such that it +happens extremely rarely on legitimate connections, as shown in +Section~\ref{ctor:sec:performance}. + +We summarize phase~1 with the following algorithm that provides more explicit +steps and details, including the addition of a parameter \texttt{ct-submit-pr} +that indicates a probability that an SFO is submitted to a CTR. This provides +probabilistic security while providing the ability to adjust submission rates to +account for +CTR and more general network scaling/health issues. Given an incoming SFO $s$, +Tor Browser should: +\begin{enumerate} + \item Raise a certificate error and stop if the certificate chain of $s$ + is not rooted in Tor Browser's trust store. + \item Raise a certificate transparency error and stop if the SCTs of $s$ + fail Tor Browser's CT policy. + \item If $\mathsf{len}(s) < \texttt{ct-large-sfo-size}$, accept $s$ and + conduct the remaining steps in the background while the TLS connection + and subsequent HTTP request/response proceed. If $\mathsf{len}(s) \geq + \texttt{ct-large-sfo-size}$ pause the TLS handshake, complete the + remaining steps, accept~$s$ as valid and then continue the handshake. + \item Flip a biased coin based on \texttt{ct-submit-pr} and stop if the + outcome indicates no further auditing. + \item Submit $s$ to a random CTR on a pre-built circuit. The circuit used + for submission is closed immediately without waiting for any + acknowledgment. +\end{enumerate} + +\subsection{Phase 2: Buffering} \label{ctor:sec:base:phase2} + +Once received, the most straightforward thing for a CTR to do would be to +contact the issuing log and request an inclusion proof relative to a trusted +STH\@. 
(And if the SCT's MMD has not elapsed, hold the SFO until it has.) +However, this proposal has two flaws, the first of which leads us to the actual +design of phase 2. + +Immediately contacting the log about an SFO (i) allows the log to predict when +exactly it will receive a request about an SFO and (ii) discloses real-time +browsing behavior to the log. The former problem means that an attacker can +position resources for perpetuating an attack ahead-of-time, as well as letting +it know with certainty whether a connection was audited (based on +\texttt{ct-submit-pr}). The latter is some amount of information leakage that +can help with real-time traffic analysis. + +Because a CTR must support buffering SCTs regardless (due to the MMD), we can +schedule an event in the future for when each SFO should be audited. Adding a +per-SFO value sampled from \texttt{ct-delay-dist} effectively adds stop-and-go +mixing~\cite{kesdogan:ih1998} to the privacy protection, but where there is only +one mix (CTR) between sender (client) and receiver (CT log). So there is no +point in a client-specified interval-start-time such that the mix drops messages +arriving before then, and there is no additional risk in having the interval end +time set by the mix rather than the sender. This means both that some SFOs a +client sends to a CTR at roughly the same time might be audited at different +times and that SFOs submitted to that CTR by other honest clients are more +likely to be mixed with these. + +In addition to buffering SFOs for mixing effects, we also add a layer of caching +to reduce the storage overhead, prevent unnecessary log connections, and limit +the disclosure to logs. With regards to some CT circuit, an incoming SFO $s$ is +processed as follows by a CTR: +\begin{enumerate} + \item\label{ctor:enm:storage:close} Close the circuit to enforce one-time use. 
+ \item\label{ctor:enm:storage:unrecognized} Discard all SCTs in the SFO for logs + the CTR is not aware of; if no SCT remains then discard the SFO. + \item\label{ctor:enm:storage:cached} Stop if $s$ is cached or already pending to + be audited in the buffer. See caching details in + Section~\ref{ctor:sec:performance:estimates}. + \item\label{ctor:enm:storage:fix-log} Sample a CT log $l$ that issued a + remaining SCT in~$s$. + \item\label{ctor:enm:storage:audit-after} Compute an \texttt{audit\_after} + time~$t$, see Figure~\ref{ctor:fig:audit-after}. + \item\label{ctor:enm:storage:store} Add $(l,t,s)$ to a buffer of pending SFOs to + audit. +\end{enumerate} + +What makes a CT log known to the CTR is part of the Tor consensus, see +Section~\ref{ctor:sec:base:consensus}. It implies knowledge of a trusted STH for the +sampled CT log $l$, which refers to an entity that (i) issued an SCT in the +submitted SFO, and (ii) will be challenged to prove inclusion in phase~3 +sometime after the \texttt{audit\_after} timestamp $t$ elapsed. We choose one +SCT (and thus log) at random from the SFO because it is sufficient to suspect +only one misbehaving log so long as we report the entire SFO, allowing us to +identify the other malicious CT logs later on (a risk averse-attacker would not +conduct an attack without controlling enough logs, i.e., one benign log would +otherwise make the mis-issued certificate public). + +\begin{figure}[!t] + \centering + \pseudocode[linenumbering, syntaxhighlight=auto]{% + \textrm{t} \gets \mathsf{now}() + + \mathsf{MMD} + + \mathsf{random}(\texttt{ct-delay-dist}) \\ + \pcif \textrm{SCT.timestamp} + \textrm{MMD} < + \mathsf{now}():\\ + \pcind\textrm{t} \gets \mathsf{now}() + + \mathsf{random}(\texttt{ct-delay-dist}) + } + \caption{% + Algorithm that computes an \texttt{audit\_after} timestamp $t$. 
+ } + \label{ctor:fig:audit-after} +\end{figure} + +The \texttt{audit\_after} timestamp specifies the earliest point in time that an +SCT from an SFO will be audited in phase~3, which adds random noise that +obfuscates real-time browsing patterns in the Tor network and complicates +predictions of when it is safe to assume no audit will take place. If memory +becomes a scarce resource, pending triplets should be deleted at +random~\cite{nordberg}. Figure~\ref{ctor:fig:audit-after} shows that $t$ takes the +log's MMD into account. This prevents an \emph{early signal} to the issuing CT +logs that an SFO is being audited. For example, if an SFO is audited before the +MMD elapsed, then the issuing CT log could simply merge the underlying +certificate chain to avoid any MMD violation. However, by taking the MMD into +account, this results in a relatively large time window during which the +attacker can attempt to \emph{flood} all CTRs in hope that they delete the +omitted SFO at random before it is audited. We discuss the threat of flooding +further in Section~\ref{ctor:sec:analysis}, noting that such an attack can +be detected if CTRs publish two new metrics in the extra-info document: +\texttt{ct-receive-bytes} and \texttt{ct-delete-bytes}. These metrics indicate +how many SFO bytes were received and deleted throughout different time +intervals, which is similar to other extra-info metrics such as +\texttt{read-history} and \texttt{write-history}. + +\subsection{Phase 3: Auditing} \label{ctor:sec:base:phase3} + +As alluded to in phase 2, there is a second problem why the simple behavior of +``contact the log and request an inclusion proof'' is unacceptable. We include +the ability to DoS an individual Tor relay in our threat model---if the log +knows which CTR holds the evidence of its misbehavior, it can take the CTR +offline, wiping the evidence of the log's misbehavior from its memory. + +We can address this concern in a few ways. 
The simple proposal of contacting the +log over a Tor circuit will not suffice: + a log can tag each CTR by submitting unique SFOs to them all, and + recognize the CTR when they are submitted (see + Section~\ref{ctor:sec:analysis}). +Even using a unique Tor circuit for each SFO might not suffice to prevent +effective tagging attacks. For example, after tagging all CTRs, a malicious log +could ignore all but innocuous untagged requests and tagged requests matching +tags for whichever CTR it decides to respond to first. If some kind of +back-off is supported (common to delay retransmissions and avoid congestion), +the rest of the CTRs will likely be in back-off so that there is a high +probability that the first CTR is the one fetching proofs. The log can repeat +this process---alternating tagged CTRs it replies to---until it receives the +offending SFO from an identifiable CTR with high probability. CTRs may report +the log as inaccessible for days, but that is not the same as direct +cryptographic evidence of misbehavior. + +While there are ways to detect this attack after-the-fact, and there may be ways +to mitigate it, a more robust design would tolerate the disclosure of a CTRs +identity to the log during the auditing phase without significant security +implications. A simple appealing approach is to write the data to disk prior +to contacting the log; however, Tor relays are explicitly designed not to write +data about user behavior to disk unless debug-level logging is enabled. Relay +operators have expressed an explicit desire to never have any user data +persisted to disk, as it changes the risk profile of their servers with regards +to search, seizure, and forensic analysis. + +The final design is to have the CTR work with a partner CTR---we call it a +\emph{watchdog}---that they choose at random and contact over a circuit. Prior +to attempting to fetch a proof from a log, the CTR provides the watchdog with +the SFO it is about to audit. 
After an appropriate response from the log, the +CTR tells the watchdog that the SFO has been adequately addressed. + +In more detail, each CTR maintains a single shared circuit that is used to +interact with all CT logs known to the CTR (we are not using one circuit per +SFO given the overhead and unclear security benefit noted above). For +\emph{each} such log $l$, the CTR runs the following steps: %indefinitely: +\begin{enumerate} + \item\label{ctor:enm:auditing:backoff} Sample a delay $d \gets + \mathsf{random}(\texttt{ct-backoff-dist})$ and wait until $d$ time units + elapsed. + \item Connect to a random watchdog CTR\@. + \item\label{ctor:enm:auditing:loop} For each pending buffer entry $(l',s,t)$, + where $l' = l$ and $t <= \mathsf{now}()$: + \begin{enumerate} + \item\label{ctor:enm:ext:auditing:watchdog} Share $s$ with the current + watchdog. + \item\label{ctor:enm:ext:auditing:challenge} Challenge the log to prove + inclusion to the closest STH in the Tor + consensus where $t$ $\leq$ + STH\texttt{.timestamp}. Wait + \texttt{ct-log-timeout} time units for the + complete proof before timing out. + \begin{itemize} + \item\label{ctor:enm:ext:auditing:challenge:success} On valid + proof: send an acknowledgment to the watchdog, cache $s$ + and then discard it. + \item\label{ctor:enm:ext:auditing:challenge:fail} On any other + outcome: close circuit to the watchdog CTR, discard $s$, + and go to step~1. + \end{itemize} + \end{enumerate} +\end{enumerate} + +\subsection{Phase 4: Reporting} + +At any given time, a CTR may be requesting inclusion proofs from logs and act as +a watchdog for one or more CTRs. A CTR acting as a watchdog will have at +most one SFO held temporarily for each other CTR it is interacting with. If an +acknowledgement from the other CTR is not received within +\texttt{ct-watchdog-timeout}, it becomes the watchdog's responsibility to report +the SFO such that it culminates in human review if need be. 
+ +Because human review and publication is critical at this end-stage, we envision +that the watchdog (which is a Tor relay that cannot persist any evidence to disk +and may not be closely monitored by its operator) provides the SFO to an +independent CT auditor that is run by someone that closely monitors its +operation. When arriving at the design of the CTR being a +role played by a Tor relay, we eschewed separate auditors because of the lack of +automatic scaling with the Tor network, the considerable aggregation of browsing +behavior across the Tor network, and the difficulties of curation and validation +of trustworthy individuals. SFOs submitted to auditors at this stage have been +filtered through the CTR layer (that additionally backs-off if the logs become +unavailable to prevent an open pipe of SFOs from being reported), resulting in +an exponentially smaller load and data exposure for auditors. This should allow +for a smaller number of them to operate without needing to scale with the +network. + +While we assume that most auditors are trusted to actually investigate the +reported SFOs further, the watchdog needs to take precautions talking to them +because the network is not trusted.\footnote{% + While our threat model, and Tor's, precludes a global network adversary, + both include partial control of the network. +} The watchdog can contact the auditor immediately, but must do so over +an independent Tor circuit.\footnote{% + This is also important because CTRs are not necessarily exits, i.e., the + exiting traffic must be destined to another Tor relay. +} If a successful acknowledgement from the auditor is not received within +\texttt{ct-auditor-timeout}, the SFO is buffered for a random time using +\texttt{ct-delay-dist} before being reported to the same auditor again over a +new independent Tor circuit. 
+ +When an auditor receives an SFO, it should persist it to durable storage until +it can be successfully resolved to a specific STH.\footnote{% + The fetched inclusion proof must be against the first known STH that + should have incorporated the certificate in question by using the + history of STHs in Tor's consensus: + the mis-issued certificate might have been merged into the log + reactively upon learning that a CTR reported the SFO, such that a valid + inclusion proof can be returned with regards to a more recent STH but + not earlier ones that actually captured the log's misbehavior. +} Once so persisted, the auditor can begin querying the log itself asking for +an inclusion proof. If no valid inclusion proof can be provided after some +threshold of time, the auditor software should raise the details to a human +operator for investigation. + +Separately, the auditor should be retrieving the current Tor consensus and +ensuring that a consistency proof can be provided between STHs from the older +consensus and the newer. If consistency cannot be established after some +threshold of time, the auditor software should raise the details to a human +operator for investigation. An auditor could also monitor a log's uptime and +report on excessive downtime. Finally, it is paramount that the auditor +continuously monitors its own availability from fresh Tor-circuits by submitting +known SFOs to itself to ensure that an attacker is not keeping watchdogs from +connecting to it. + +\subsection{Setup} \label{ctor:sec:base:consensus} + +There are a number of additional details missing to setup phases 1--4 for the +design. Most of these details relate to the Tor consensus. 
Directory authorities +influence the way in which Tor Browser and CTRs behave by voting on necessary +parameters, such as the probability of submission of an SFO +(\texttt{ct-submit-pr}) and the timeout used by CTRs when auditing CT logs +(\texttt{ct-log-timeout}), as introduced earlier as part of the design. See +Appendix~\ref{ctor:app:consensus-params} for details on these parameters and their +values that were previously used. Next, we briefly introduce a number of +implicitly used parts from our design that should also be part of the consensus. + +In the consensus, the existing \texttt{known-flags} item determines the +different flags that the consensus might contain for relays. We add another +flag named \texttt{CTR}, which indicates that a Tor relay should support +CT-auditing as described here. A relay qualifies as a CTR if it is flagged as +\texttt{stable} and not \texttt{exit}, to spare the relatively sparse exit +bandwidth and only use relays that can be expected to stay online. +Section~\ref{ctor:sec:privacy} discusses trade-offs in the assignment of the +\texttt{CTR} flag. + +The consensus should also capture a fixed view of the CT log ecosystem by +publishing STHs from all known logs. A CT log is known if a majority of +directory authorities proposed a \texttt{ct-log-info} item, which contains a +log's ID, public key, base URL, MMD, and most recent STH. Each directory +authority proposes its own STH, and agrees to use the most recent STH as +determined by timestamp and lexicographical order. Since CTRs verify inclusion +with regards to SCTs that Tor Browser accepts, the CT logs recognized by Tor +Browser must be in Tor's consensus. + +Tor's directory authorities also majority-vote on \texttt{ct-auditor} items, +which pin base URLs and public keys of CT auditors that watchdogs contact in +case that any log misbehavior is suspected. 
diff --git a/summary/src/ctor/src/introduction.tex b/summary/src/ctor/src/introduction.tex new file mode 100644 index 0000000..2206ec5 --- /dev/null +++ b/summary/src/ctor/src/introduction.tex @@ -0,0 +1,183 @@ +\section{Introduction} \label{ctor:sec:introduction} +Metrics reported by Google and Mozilla reveal that encryption on the web +skyrocketed the past couple of years: at least 84\% of all web pages load using +HTTPS~\cite{google-metrics,mozilla-metrics}. An HTTPS connection is initiated by +a TLS handshake where the client's web browser requires that the web server +presents a valid certificate to authenticate the identity of the server, e.g., +to make sure that the client who wants to visit \texttt{mozilla.org} is really +connecting to Mozilla, and not, say, Google. A certificate specifies the +cryptographic key-material for a given domain name, and it is considered valid +if it is digitally signed by a Certificate Authority (CA) that the web browser +trusts. + +It is a long-known problem that the CA trust model suffers from +weakest-link security: + web browsers allow hundreds of CAs to sign arbitrary domain-name to + key-bindings, + which means that it suffices to compromise a single CA to acquire any + certificate~\cite{https-sok,ca-ecosystem}. +Motivated by prominent CA compromises, such as the issuance of fraudulent +certificates for + \texttt{*.google.com}, + \texttt{*.mozilla.org} and + \texttt{*.torproject.org} +by DigiNotar~\cite{diginotar}, multiple browser vendors mandated +that certificates issued by CAs must be publicly disclosed in Certificate +Transparency (CT) logs to be valid. The idea behind CT is that, by making all +CA-issued certificates transparent, mis-issued ones can be detected +\emph{after the fact}~\cite{ct/a,ct,ct/bis}. 
The appropriate actions can then +be taken to keep the wider web safe, e.g., by + investigating the events that lead up to a particular incident, + removing or limiting trust in the offending CA, and + revoking affected certificates. +Google Chrome and Apple's Safari currently enforce CT by augmenting the TLS +handshake to require cryptographic proofs from the server that the presented +certificate \emph{will appear} in CT logs that the respective web browsers +trust~\cite{apple-log-policy,google-log-policy}. + +In addition to increased encryption on the web, the ability to access it +anonymously matured as well. Tor with its Tor Browser has millions of daily +users~\cite{tor,mani}, and efforts are ongoing to mature the technology +for wider use~\cite{fftor}. Tor Browser builds on-top of Mozilla's Firefox: + it relays traffic between the user and the web server in question by routing + everything through the Tor network, + which is composed of thousands of volunteer-run relays that are located + across the globe~\cite{relay-by-flag}. +Just like attackers may wish to break security properties of HTTPS, it may also +be of interest to break the anonymity provided by Tor. A common technique for +deanonymization (known to be used in practice) is to compromise Tor +Browser instead of circumventing the anonymity provided by +Tor~\cite{lepop1,selfrando,lepop2,zerotor}. Web browsers like Firefox +(or forks thereof) are one of the most complex software types that are widely +used today, leading to security vulnerabilities and clear incentives for +exploitation. For example, the exploit acquisition platform Zerodium offers up +to \$$100,000$ for a Firefox zero-day exploit that provides remote code +execution and local privilege escalation (i.e., full control of the +browser)~\cite{zeromain}. + +An attacker that wishes to use such an exploit to compromise and then ultimately +deanonymize a Tor Browser user has to deliver the exploit somehow. 
Since the +web is mostly encrypted, this primarily needs to take place over an HTTPS +connection where the attacker controls the content returned by the web server. +While there are numerous possible ways that the attacker can accomplish this, +e.g., by compromising a web server that a subset of Tor Browser users visit, +another option is to \emph{impersonate} one or more web servers by acquiring +fraudulent certificates. Due to the Tor network being run by volunteers, getting +into a position to perform such an attack is relatively straightforward: + the attacker can volunteer to run malicious exit + relays~\cite{spoiled-onions}. +The same is true for an attacker that wishes to man-in-the-middle connections +made by Tor Browser users. In some cases a Tor Browser exploit may not even be +needed for deanonymization, e.g., the attacker can observe if the user logs on +to a service linking an identity. + +\subsection{Introducing CTor} +We propose an incrementally deployable and privacy-preserving design that is +henceforth referred to as CTor. By bringing CT to Tor, HTTPS-based +man-in-the-middle attacks against Tor Browser users can be detected \emph{after +the fact} when conducted by attackers that: +\begin{enumerate} + \item can acquire any certificate from a trusted CA, + \item with the necessary cryptographic proofs from enough CT logs so that + Tor Browser accepts the certificate as valid without the attacker + making it publicly available in any of the controlled logs, and + \item with the ability to gain full control of Tor Browser shortly after + establishing an HTTPS connection. +\end{enumerate} + +The first and third capabilities are motivated directly by shortcomings in the +CA ecosystem as well as how the anonymity of Tor Browser is known to be +attacked.
The second capability assumes the same starting point as Google +Chrome and Apple's Safari, namely, that the logs are trusted to \emph{promise} +public logging, which is in contrast to being untrusted and thus forced to +\emph{prove} it. This is part of the gradual CT deployment that avoided +breakage on the web~\cite{does-ct-break-the-web}. Therefore, we start +from the assumption that Tor Browser accepts a certificate as valid if +accompanied by two independent promises of public logging. The limitation of +such CT enforcement is that it is trivially bypassed by an attacker that +controls two seemingly independent CT logs. This is not to say that trusting +the log ecosystem would be an insignificant Tor Browser improvement when +compared to no CT at all, but CTor takes us several steps further by relaxing +and ultimately eliminating the trust which is currently (mis)placed in today's +browser-recognized CT logs. +We already observed instances of CT logs that happened to + violate their promises of public logging~\cite{gdca1-omission}, + show inconsistent certificate contents to different + parties~\cite{izenpe-disqualified,venafi-disqualified}, and + get their secret signing keys compromised due to disclosed remote + code-execution vulnerabilities~\cite{digicert-log-compromised}. + +The first design increment uses the CT landscape against the attacker to +ensure a non-zero (tweakable) probability of public disclosure \emph{each time} +a fraudulent certificate is used against Tor Browser. This is done by randomly +adding a subset of presented certificates to CT logs that the attacker may not +control (inferred from the accompanied promises of public logging). Such +\emph{certificate cross-logging} distributes trust across all CT logs, raising +the bar towards unnoticed certificate mis-issuance. Motivated by factors like +privacy, security and deployability, Tor Browser uses Tor relays as +intermediates to cache and interact with CT logs on its behalf. 
Such deferred +auditing is a fundamental part of our setting unless future distributed auditing +mechanisms turn out to be non-interactive from the browser's perspective. + +The next incremental step is to not only cross-log certificates but also their +promises of public logging. While it requires an additional CT log API +endpoint, it facilitates auditing of these promises if some logs are +trustworthy. The full design also holds logs accountable but without any such +assumption: + Tor relays challenge the logs to prove correct operation with regards to a + single fixed view in Tor's consensus, and + potential issues are reported to auditors that investigate them further. + +\subsection{Contribution and Structure} +Section~\ref{ctor:sec:background} introduces background on the theory and practice of +CT, as well as the anonymity network Tor. Section~\ref{ctor:sec:adversary} motivates +the intended attacker and presents a unified threat model for CT and Tor. +Section~\ref{ctor:sec:base} describes the full CTor design that \emph{eliminates all +trust in the browser-recognized CT logs} by challenging them to prove +certificate inclusion cryptographically, and would result in a \emph{single +probabilistically-verified view of the CT log ecosystem available from Tor's +consensus}. This view could be used by other browsers as the basis of trust, +\emph{greatly improving the security posture of the entire web}. The security +analysis in Section~\ref{ctor:sec:analysis} shows that one of the best bets for the +attacker would be to take network-wide actions against Tor to avoid public +disclosure of certificate mis-issuance and log misbehavior. Such an attack is +trivially detected, but it is hard to attribute unless reactive defenses are +enabled at the cost of trade-offs. + +The full design involves many different components that add deployment burdens, +such as the requirement of reliable CT auditors that investigate suspected log +misbehavior further.
Therefore, we additionally propose two initial increments +that place \emph{some trust in CT logs} (Section~\ref{ctor:sec:incremental}). The +first increment \emph{provides evidence to independent CT logs that fraudulent +certificates were presented while preserving privacy}. This greatly impacts +risk-averse attackers because one part of their malicious behavior becomes +transparent \emph{if the randomly selected log operator is benign}. For +example, the targeted domain name is disclosed as part of the cross-logged +certificate, and awareness of the event draws unwanted attention. + +The next increment is minor from the perspective of Tor, but requires CT logs to +support an additional API. Similar changes were proposed in the context of CT +gossip~\cite{minimal-gossip}. If supported, Tor relays could expose both the +mis-issued certificates and the operators that promised to log them publicly +\emph{without the complexity of ever distinguishing between what is benign and +fraudulent}. +This API change happens to also build auditor infrastructure +directly into CT log software, thereby paving the path towards the missing component of +the full design. We argue that CTor can be deployed incrementally: + complete Firefox's CT enforcement~\cite{ffct}, + add our cross-logging increments, and + finally put the full design into operation. +Each part of CTor would \emph{greatly contribute to the open question of how +to reduce and/or eliminate trust in browser-recognized log operators}, which is +caused by the lack of an appropriate gossip mechanism as well as privacy issues +while interacting with the logs~\cite{ct-with-privacy,minimal-gossip,nordberg}. + +We show that circuit-, bandwidth- and memory-\emph{overheads are modest} by +computing such estimates in Section~\ref{ctor:sec:performance}. Therefore, we do not +investigate performance further in any experimental setting. 
+Section~\ref{ctor:sec:privacy} discusses privacy aspects of our design choices with +a focus on the essential role of the Tor network's distributed nature to +preserve user privacy as well as the overall security. In gist, +\emph{a similar approach would be privacy-invasive without Tor}, e.g., if +adopted by Google Chrome. Section~\ref{ctor:sec:related} outlines related work. +Section~\ref{ctor:sec:conclusion} concludes the paper. diff --git a/summary/src/ctor/src/performance.tex b/summary/src/ctor/src/performance.tex new file mode 100644 index 0000000..e641ba1 --- /dev/null +++ b/summary/src/ctor/src/performance.tex @@ -0,0 +1,142 @@ +\section{Performance} \label{ctor:sec:performance} +The following analysis shows that CTor's overhead is modest based on computing +performance estimates from concrete parameter properties and two public data +sets. + +\subsection{Setup} +Mani~\emph{et~al.} derived a distribution of website visits over Tor and an +estimation of the number of circuits through the network~\cite{mani}. We use +their results to reason about overhead as the Tor network is under heavy load, +assuming 140~million daily website visits (the upper bound of a 95\% confidence +interval). Our analysis also requires a distribution that captures typical SFO +properties per website visit. Therefore, we collected an SFO data set by +browsing the most popular webpages submitted to Reddit (r/frontpage, all time) +on December 4, 2019. The data set contains SFOs from 8858 webpage visits, and +it is available online as an open access artifact together with the associated +scripts~\cite{sfo-dist}. Notably we hypothesized that browsing actual webpages +as opposed to front-pages would yield more SFOs. When compared to Alexa's +list it turned out to be the case: + our data set has roughly two additional SFOs per data point. +This makes it less likely that our analysis is an underestimate. 
+ +We found that an average certificate chain is 5440~bytes, and it is seldom +accompanied by more than a few SCTs. As such, a typical SFO is in the order of +6~KiB. No certificate chain exceeded 20~KiB, and the average number of SFOs per +webpage was seven. The latter includes 1--2 SFOs per data point that followed +from our client software calling home on start-up (Chromium~77). + +We assume no abnormal CTor behavior, which means that there will be little or +no CTR back-offs due to the high uptime requirements of today's CT logs: 99\%. +We set \texttt{ct-large-sfo-size} conservatively to avoid blocking in the TLS +handshake (e.g., 20~KiB), and use a 10\% submission probability as well as a +10~minute random buffer delay on average. It is likely unwarranted to use a +higher submission probability given that the intended attacker is risk-averse. +Shorter buffer times would leak finer-grained browsing patterns to the logs, +while longer ones increase the attack surface in phase~2. Therefore, we +selected an average for \texttt{ct-delay-dist} that satisfies neither of the two +extremes. The remaining CTor parameters are timeouts, which have little or no +performance impact if set conservatively (few seconds). + +\subsection{Estimates} \label{ctor:sec:performance:estimates} +The incremental cross-logging designs are analyzed first without any caching. +Caching is then considered, followed by overhead that appears only in the full +design. + +\textbf{Circuit Overhead.} +Equation~\ref{ctor:eq:sub-oh} shows the expected circuit overhead from Tor Browser +over time, where $p$ is the submit probability and $\bar{d}$ the average number +of SFOs per website visit. The involved overhead is linear as either of the two +parameters is tuned up or down.
+ +\begin{equation} \label{ctor:eq:sub-oh} + p\bar{d} +\end{equation} + +Using $p\gets\frac{1}{10}$ and our approximated SFO distribution $\bar{d}\gets7$ +yields an average circuit overhead of $0.70$, i.e., for every three Tor Browser +circuits CTor adds another two. Such an increase might sound +daunting at first,\footnote{% + Circuit establishment involves queueing of onionskins~\cite{onionskins} and + it is a likely bottleneck, but since the introduction of ntor it is not a + scarce resource so such overhead is acceptable if it (i) serves a purpose, + and (ii) can be tuned. Confirmed by Tor developers. +} but these additional circuits are short-lived and light-weight; transporting +6~KiB on average. Each CTR also maintains a long-lived circuit for CT log +interactions. + +\textbf{Bandwidth Overhead.} Equation~\ref{ctor:eq:bw} shows the expected +bandwidth overhead for the Tor network over time, where + $V$ is the number of website visits per time unit, + $p$ the submit probability, + $\bar{d}$ the average number of SFOs per website visit, and + $\bar{s}$ the average SFO byte-size. + +\begin{equation} \label{ctor:eq:bw} + 6Vp\bar{d}\bar{s} +\end{equation} + +$Vp\bar{d}$ is the average number of SFO submissions per time unit, which can be +converted to bandwidth by weighting each submission with the size of +a typical SFO and accounting for it being relayed six times: + three hops from Tor Browser to a CTR, then + another three hops from the CTR to a CT log + (we assumed symmetric Tor relay bandwidth). +Using + $V\gets 140\textrm{~M/day}$, + $p \gets \frac{1}{10}$, + $\bar{d} \gets 7$, + $\bar{s} \gets 6\textrm{~KiB}$ +and converting the result to bps yields 334.5~Mbps in total. Such order of +overhead is small when compared to Tor's capacity: +450~Gbps~\cite{tor-bandwidth}. 
+ +\textbf{Memory Overhead.} +Equation~\ref{ctor:eq:memory} shows the expected buffering overhead, where + $V_m$ is the number of website visits per minute, + $t$ the average buffer time in minutes, + $R$ the number of Tor relays that qualify as CTRs, and + $\bar{s}$ the typical SFO size in bytes. + +\begin{equation} \label{ctor:eq:memory} + \frac{V_mt}{R} \bar{s} +\end{equation} + +$V_mt$ represents incoming SFO submissions during the average buffer time, which +are randomly distributed across $R$ CTRs. Combined, this yields the expected +number of SFOs that await at a single CTR in phase~2, and by taking the +byte-size of these SFOs into account we get an estimate of the resulting memory +overhead. Using + $V_m \gets \frac{140\textrm{~M}}{24\cdot60}$, + $t \gets 10$~m, + $R \gets 4000$ based on the CTR criteria in + Section~\ref{ctor:sec:base:consensus}, and + $\bar{s} \gets 6\textrm{~KiB}$ +yields 1.42~MiB. Such order of overhead is small when compared to the +recommended relay configuration: + at least 512~MiB~\cite{relay-config}. + +A cache of processed SFOs reduces the CTR's buffering memory and log +interactions proportionally to the cache hit ratio. Mani~\emph{et al.} showed +that if the overrepresented \texttt{torproject.org} is removed, about one third +of all website visits over Tor can be attributed to Alexa's top-1k and another +one third to the top-1M~\cite{mani}. +Assuming 32~byte cryptographic hashes and seven SFOs per website visit, a cache +hit ratio of $\frac{1}{3}$ could be achieved by a 256~KiB LFU/LRU cache that +eventually captures Alexa's top-1k. Given that the cache requires memory as +well, this is mainly a bandwidth optimization. + +\textbf{Full Design.} +For each CTR and CT log pair, there is an additional watchdog circuit that +transports the full SFO upfront before fetching an inclusion proof.
The +expected bandwidth overhead is at most $9Vp\bar{d}\bar{s}$, i.e., now +also accounting for the three additional hops that an SFO is subject to. In +practice the overhead is slightly less, because an inclusion query and its +returned proof are smaller than an SFO. We expect little or no +watchdog-to-auditor overhead if the logs are available, and otherwise one +light-weight circuit that reports a single SFO for each CTR that goes into +back-off. Such overhead is small when compared to all Tor Browser submissions. +Finally, the required memory increases because newly issued SFOs are buffered +for at least an MMD. Only a small portion of SFOs are newly issued, however: + the short-lived certificates of Let's Encrypt are valid for + 90~days~\cite{le}, which is in contrast to 24~hour + MMDs~\cite{google-log-policy}. diff --git a/summary/src/ctor/src/privacy.tex b/summary/src/ctor/src/privacy.tex new file mode 100644 index 0000000..2738dba --- /dev/null +++ b/summary/src/ctor/src/privacy.tex @@ -0,0 +1,48 @@ +\section{Privacy} \label{ctor:sec:privacy} +There is an inherent privacy problem in the setting due to how CT is designed +and deployed. A browser, like Tor Browser, that wishes to validate that SFOs presented to +it are \emph{consistent} and \emph{included} in CT logs must directly or +indirectly interact with CT logs wrt. its observed SFOs. Without protections +like Private Information Retrieval (PIR)~\cite{PIR} that require server-side +support or introduction of additional parties and trust +assumptions~\cite{kales,lueks-and-goldberg}, exposing SFOs to any party risks +leaking (partial) information about the browsing activities of the user. + +Given the constraints of the existing CT ecosystem, CTor is made +privacy-preserving thanks to the distributed nature of Tor with its anonymity +properties and high-uptime relays that make up the Tor network. First, all +communication between Tor Browser, CTRs, CT logs, and auditors is made over +full Tor-circuits.
This is a significant privacy-gain, not available, e.g., to +browsers like Chrome that in their communications would reveal their public +IP-address (among a number of other potentially identifying metadata). Secondly, +the use of CTRs as intermediaries probabilistically delays the interaction with +the CT logs---making correlating Tor Browser user browsing with CT log +interaction harder for attackers---and safely maintains a dynamic cache of the +most commonly already verified SFOs. While browsers like Chrome could maintain a +cache, Tor Browser's security and privacy goals +(Section~\ref{ctor:sec:background:tor}) prohibit such shared (persisted) dynamic +state. + +In terms of privacy, the main limitation of CTor is that CTor continuously leaks +to CT logs---and to a \emph{lesser extent} auditors (depending on design)---a +fraction of certificates of websites visited using Tor Browser to those that +operate CT logs. This provides to a CT log a partial list of websites visited +via the Tor network over a period of time (determined by +\texttt{ct-delay-dist}), together with some indication of distribution based on +the number of active CTRs. It does not, however, provide even pseudonymously any +information about which sites individual users visit, much less with which +patterns or timing. As such it leaks significantly less information than does +OCSP validation by Tor Browser or DNS resolution at exit-relays~\cite{TorDNS}, +both of which indicate visit activity in real time to a small number of +entities. + +Another significant limitation is that relays with the CTR flag learn real-time +browser behavior of Tor users. Relays without the \texttt{exit} flag primarily +only transport encrypted Tor-traffic between clients and other relays, never to +destinations. 
If such relays are given the CTR flag---as we stated in the full +design, see Section~\ref{ctor:sec:base:consensus}---then this might discourage some +from running Tor relays unless it is possible to opt out. Another option is to +give the CTR flag only to exit relays, but this \emph{might be} undesirable for +overall network performance despite the modest overhead of CTor +(Section~\ref{ctor:sec:performance}). Depending on the health of the network and the +exact incremental deployment of CTor, there are different trade-offs. diff --git a/summary/src/ctor/src/ref.bib b/summary/src/ctor/src/ref.bib new file mode 100644 index 0000000..b39ae33 --- /dev/null +++ b/summary/src/ctor/src/ref.bib @@ -0,0 +1,536 @@ +@misc{apple-on-independence, + author = {Clint Wilson}, + title = {{CT} Days 2020}, + howpublished = {\url{https://groups.google.com/a/chromium.org/g/ct-policy/c/JWVVhZTL5RM}, accessed 2020-12-15} +} + +@misc{onionskins, + author = {{Tor Project}}, + title = {Functions to queue create cells for processing}, + howpublished = {\url{https://src-ref.docs.torproject.org/tor/onion__queue_8c_source.html}, accessed 2020-12-15}, +} + +@misc{delayed-merge, + author = {{Google LLC.}}, + title = {Trillian Log Signer}, + howpublished = {\url{https://github.com/google/trillian/blob/master/cmd/trillian_log_signer/main.go}, accessed 2020-12-15}, +} + +@misc{stark, + title = {Opt-in {SCT} Auditing}, + author = {Emily Stark and Chris Thompson}, + howpublished = {\url{https://docs.google.com/document/d/1G1Jy8LJgSqJ-B673GnTYIG4b7XRw2ZLtvvSlrqFcl4A/edit}, accessed 2020-12-15}, +} + +@article{meiklejohn, + author = {Sarah Meiklejohn and Pavel Kalinnikov and Cindy S. 
Lin and Martin Hutchinson and Gary Belvin and Mariana Raykova and Al Cutter}, + title = {Think Global, Act Local: Gossip and Client Audits in Verifiable Data Structures}, + journal = {CoRR}, + volume = {abs/2011.04551}, + year = {2020}, +} + +@misc{sfo-dist, + author = {Rasmus Dahlberg and Tobias Pulls and Tom Ritter and Paul Syverson}, + title = {{SFO} Distribution Artifact}, + year = {2020}, + howpublished = {\url{https://github.com/rgdd/ctor/tree/master/artifact}}, +} + +@misc{ct-policy-mailing-list, + author = {{CT policy mailing list}}, + title = {{Certificate Transparency} Policy}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/\#!forum/ct-policy}, accessed 2020-12-15}, +} + +@misc{no-hard-fail, + author = {Adam Langley}, + title = {No, don't enable revocation checking}, + howpublished = {\url{https://www.imperialviolet.org/2014/04/19/revchecking.html}, accessed 2020-12-15}, +} + +@misc{de-anonymize-exploit, + author = {Joseph Cox}, + title = {The {FBI} Used a 'Non-Public' Vulnerability to Hack Suspects on {Tor}}, + howpublished = {\url{https://www.vice.com/en_us/article/kb7kza/the-fbi-used-a-non-public-vulnerability-to-hack-suspects-on-tor}, accessed 2020-12-15}, +} + +@Misc{forbes-fbi-tor, + author = {Kashmir Hill}, + title = {How Did The {FBI} Break {Tor}?}, + howpublished = {\url{https://www.forbes.com/sites/kashmirhill/2014/11/07/how-did-law-enforcement-break-tor/#6cf2ed594bf7}, accessed 2020-12-15}, +} + + +@Misc{doj-fbi-tor, + author = {{U.S. Dept.
of Justice}}, + title = {More Than 400 .Onion Addresses, Including Dozens of ‘Dark Market’ Sites, Targeted as Part of Global Enforcement Action on {Tor} Network}, + howpublished = {\url{https://www.fbi.gov/news/pressrel/press-releases/more-than-400-.onion-addresses-including-dozens-of-dark-market-sites-targeted-as-part-of-global-enforcement-action-on-tor-network}, accessed 2020-12-15}, +} + + +@Misc{syria-facebook-mitm, + author = {Peter Eckersley}, + title = {A {Syrian} Man-In-The-Middle Attack against {Facebook}}, + howpublished = {\url{https://www.eff.org/deeplinks/2011/05/syrian-man-middle-against-facebook}, accessed 2020-12-15}, +} + +@misc{wiki-bgp, + author = {{Wikipedia contributors}}, + title = {{BGP} hijacking---{Wikipedia}{,} The Free Encyclopedia}, + howpublished = {\url{https://en.wikipedia.org/w/index.php?title=BGP_hijacking&oldid=964360841}, accessed 2020-12-15}, +} + +@misc{bgp-hijacking-for-crypto-2, + author = {Ameet Naik}, + title = {Anatomy of a {BGP} Hijack on {Amazon’s} Route 53 {DNS} Service}, + howpublished = {\url{https://blog.thousandeyes.com/amazon-route-53-dns-and-bgp-hijack}, accessed 2020-12-15}, +} + +@misc{bgp-hijacking-for-crypto, + author = {Joe Stewart}, + title = {{BGP} Hijacking for Cryptocurrency Profit}, + howpublished = {\url{https://www.secureworks.com/research/bgp-hijacking-for-cryptocurrency-profit}, accessed 2020-12-15}, +} + +@misc{myetherwallet, + author = {Russell Brandom}, + title = {Hackers emptied {Ethereum} wallets by breaking the basic infrastructure of the {Internet}}, + howpublished = {\url{https://www.theverge.com/2018/4/24/17275982/myetherwallet-hack-bgp-dns-hijacking-stolen-ethereum}, accessed 2020-12-15}, +} + +@Misc{ethereum-hijack-isoc, + author = {Aftab Siddiqui}, + title = {What Happened? 
{The Amazon Route 53 BGP} Hijack to Take Over {Ethereum} Cryptocurrency Wallets}, + howpublished = {\url{https://www.internetsociety.org/blog/2018/04/amazons-route-53-bgp-hijack/}, accessed 2020-12-15}} + +@Misc{iran-telegram-bgp, + author = {Patrick Howell O'Neill}, + title = {Telegram traffic from around the world took a detour through {Iran}}, + howpublished = {\url{https://www.cyberscoop.com/telegram-iran-bgp-hijacking/}, accessed 2020-12-15}, +} + +@misc{google-log-policy, + author = {{Google LLC.}}, + title = {Chromium {Certificate Transparency} Policy}, + howpublished = {\url{https://github.com/chromium/ct-policy/blob/master/README.md}, accessed 2020-12-15}, +} + +@misc{apple-log-policy, + author = {{Apple Inc.}}, + title = {Apple's {Certificate Transparency} log program}, + howpublished = {\url{https://support.apple.com/en-om/HT209255}, accessed 2020-12-15}, +} + +@misc{tor-bandwidth, + author = {{Tor project}}, + title = {Advertised and consumed bandwidth by relay flag}, + howpublished = {\url{https://metrics.torproject.org/bandwidth-flags.html}, accessed 2020-05-30}, +} + +@misc{relay-by-flag, + author = {{Tor project}}, + title = {Relays by relay flag}, + howpublished = {\url{https://metrics.torproject.org/relayflags.html}, accessed 2020-05-29}, +} + +@misc{relay-config, + author = {{Tor project}}, + title = {Relay requirements}, + howpublished = {\url{https://community.torproject.org/relay/relays-requirements/}, accessed 2020-05-29}, +} + +@misc{turktrust, + author = {Adam Langley}, + title = {Enhancing digital certificate security}, + howpublished = {\url{https://security.googleblog.com/2013/01/enhancing-digital-certificate-security.html}, accessed 2020-12-15}, +} + +@inproceedings{doublecheck, + author = {Mansoor Alicherry and Angelos D. 
Keromytis}, + title = {{DoubleCheck}: Multi-path verification against man-in-the-middle attacks}, + booktitle = {ISCC}, + year = {2009}, +} + +@misc{consensus-transparency, + author = {Linus Nordberg}, + title = {{Tor} Consensus Transparency}, + howpublished = {\url{https://gitlab.torproject.org/tpo/core/torspec/-/blob/main/proposals/267-tor-consensus-transparency.txt}, accessed 2020-12-15}, +} + +@misc{sth-push, + author = {Ryan Sleevi and Eran Messeri}, + title = {Certificate transparency in {Chrome}: Monitoring {CT} Logs consistency}, + howpublished = {\url{https://docs.google.com/document/d/1FP5J5Sfsg0OR9P4YT0q1dM02iavhi8ix1mZlZe_z-ls/edit?pref=2&pli=1}, accessed 2020-12-15}, +} + +@misc{minimal-gossip, + author = {{Google LLC.}}, + title = {Minimal Gossip}, + howpublished = {\url{https://github.com/google/trillian-examples/blob/master/gossip/minimal/README.md}, accessed 2020-12-15}, +} + +@inproceedings{catena, + author = {Alin Tomescu and Srinivas Devadas}, + title = {Catena: Efficient Non-equivocation via {Bitcoin}}, + booktitle = {IEEE S\&P}, + year = {2017}, +} + +@inproceedings{chase, + author = {Melissa Chase and Sarah Meiklejohn}, + title = {Transparency Overlays and Applications}, + booktitle = {CCS}, + year = {2016}, +} + +@inproceedings{kales, + author = {Daniel Kales and Olamide Omolola and Sebastian Ramacher}, + title = {Revisiting User Privacy for {Certificate Transparency}}, + booktitle = {IEEE EuroS\&P}, + year = {2019}, +} + +@inproceedings{lueks-and-goldberg, + author = {Wouter Lueks and Ian Goldberg}, + title = {Sublinear Scaling for Multi-Client Private Information Retrieval}, + booktitle = {FC}, + year = {2015}, +} + +@misc{ct-over-dns, + author = {Ben Laurie}, + title = {{Certificate Transparency} over {DNS}}, + howpublished = {\url{https://github.com/google/certificate-transparency-rfcs/blob/master/dns/draft-ct-over-dns.md}, accessed 2020-12-15}, +} + +@inproceedings{lwm, + author = {Rasmus Dahlberg and Tobias Pulls}, + title = 
{Verifiable Light-Weight Monitoring for {Certificate Transparency} Logs}, + booktitle = {NordSec}, + year = {2018}, +} + +@article{ct-with-privacy, + author = {Saba Eskandarian and Eran Messeri and Joseph Bonneau and Dan Boneh}, + title = {{Certificate Transparency} with Privacy}, + journal = {PETS}, + volume = {2017}, + number = {4}, +} + +@inproceedings{ct-monitors, + author = {Bingyu Li and Jingqiang Lin and Fengjun Li and Qiongxiao Wang and Qi Li and Jiwu Jing and Congli Wang}, + title = {{Certificate Transparency} in the Wild: Exploring the Reliability of Monitors}, + booktitle = {CCS}, + year = {2019}, +} + +@inproceedings{syta, + author = {Ewa Syta and Iulia Tamas and Dylan Visher and David Isaac Wolinsky and Philipp Jovanovic and Linus Gasser and Nicolas Gailly and Ismail Khoffi and Bryan Ford}, + title = {Keeping Authorities "Honest or Bust" with Decentralized Witness Cosigning}, + booktitle = {IEEE S\&P}, + year = {2016}, +} + +@inproceedings{dahlberg, + author = {Rasmus Dahlberg and Tobias Pulls and Jonathan Vestin and Toke H{\o}iland-J{\o}rgensen and Andreas Kassler}, + title = {Aggregation-Based {Certificate Transparency} Gossip}, + booktitle = {SECURWARE}, + year = {2019}, +} + +@inproceedings{secure-logging-and-ct, + author = {Benjamin Dowling and Felix G{\"{u}}nther and Udyani Herath and Douglas Stebila}, + title = {Secure Logging Schemes and {Certificate Transparency}}, + booktitle = {ESORICS}, + year = {2016}, +} + +@misc{tor-browser, + author = {Mike Perry and Erinn Clark and Steven Murdoch and Georg Koppen}, + title = {The Design and Implementation of the {Tor Browser [DRAFT]}}, + howpublished = {\url{https://2019.www.torproject.org/projects/torbrowser/design/}, accessed 2020-12-15}, +} + +@inproceedings{mani, + author = {Akshaya Mani and T. 
Wilson{-}Brown and Rob Jansen and Aaron Johnson and Micah Sherr}, + title = {Understanding {Tor} Usage with Privacy-Preserving Measurement}, + booktitle = {IMC}, + year = {2018}, +} + +@inproceedings{ct-root-landscape, + author = {Nikita Korzhitskii and Niklas Carlsson}, + title = {Characterizing the Root Landscape of {Certificate Transparency} Logs}, + booktitle = {IFIP Networking}, + year = {2020}, +} + +@inproceedings{spoiled-onions, + author = {Philipp Winter and Richard K{\"{o}}wer and Martin Mulazzani and Markus Huber and Sebastian Schrittwieser and Stefan Lindskog and Edgar R. Weippl}, + title = {Spoiled Onions: Exposing Malicious {Tor} Exit Relays}, + booktitle = {PETS}, + year = {2014}, +} + +@misc{gdca1-omission, + title = {Un-incorporated {SCTs} from {GDCA1}}, + author = {Brendan McMillion}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/Emh3ZaU0jqI}, accessed 2020-12-15}, +} + +@misc{digicert-log-compromised, + title = {{CT2} Log Compromised via {Salt} Vulnerability}, + author = {Jeremy Rowley}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/aKNbZuJzwfM}, accessed 2020-12-15}, +} + +@misc{izenpe-disqualified, + title = {Upcoming {CT} Log Removal: {Izenpe}}, + author = {Ryan Sleevi}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/qOorKuhL1vA}, accessed 2020-12-15}, +} + +@misc{venafi-disqualified, + title = {Upcoming Log Removal: {Venafi} {CT} Log Server}, + author = {Ryan Sleevi}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/KMAcNT3asTQ}, accessed 2020-12-15}, +} + +@inproceedings{does-ct-break-the-web, + author = {Emily Stark and Ryan Sleevi and Rijad Muminovic and Devon O'Brien and Eran Messeri and Adrienne Porter Felt and Brendan McMillion and Parisa Tabriz}, + title = {Does {Certificate Transparency} Break the Web? 
{Measuring} Adoption and Error Rate}, + booktitle = {IEEE S\&P}, + year = {2019}, +} + +@inproceedings{https-sok, + author = {Jeremy Clark and Paul C. van Oorschot}, + title = {{SoK:} {SSL} and {HTTPS:} Revisiting Past Challenges and Evaluating Certificate Trust Model Enhancements}, + booktitle = {IEEE S\&P}, + year = {2013}, +} + +@inproceedings{ca-ecosystem, + author = {Zakir Durumeric and James Kasten and Michael Bailey and J. Alex Halderman}, + title = {Analysis of the {HTTPS} certificate ecosystem}, + booktitle = {IMC}, + year = {2013}, +} + +@article{ct/a, + author = {Ben Laurie}, + title = {Certificate transparency}, + journal = {CACM}, + volume = {57}, + number = {10}, + year = {2014}, +} + +@inproceedings{tor, + author = {Roger Dingledine and Nick Mathewson and Paul F. Syverson}, + title = {Tor: The Second-Generation Onion Router}, + booktitle = {USENIX Security}, + year = {2004}, +} + +@misc{rapid-tls13, + author = {Joseph A.\ Salowey and Sean Turner and Christopher A.\ Wood}, + title = {{TLS} 1.3: One Year Later}, + howpublished = {\url{https://www.ietf.org/blog/tls13-adoption}, accessed 2020-12-15}, +} + +@misc{chrome-ui, + author = {Emily Schechter}, + title = {Evolving {Chrome's} Security Indicators}, + howpublished = {\url{https://blog.chromium.org/2018/05/evolving-chromes-security-indicators.html}, accessed 2020-12-15}, +} + +@misc{firefox-ui, + author = {Johann Hofmann}, + title = {Improved Security and Privacy Indicators in {Firefox} 70}, + howpublished = {\url{https://blog.mozilla.org/security/2019/10/15/improved-security-and-privacy-indicators-in-firefox-70/}, accessed 2020-12-15} +} + +@inproceedings{le, + author = {Josh Aas and Richard Barnes and Benton Case and Zakir Durumeric and Peter Eckersley and Alan Flores{-}L{\'{o}}pez and J. Alex Halderman and Jacob Hoffman{-}Andrews and James Kasten and Eric Rescorla and Seth D. 
Schoen and Brad Warren}, + title = {{Let's Encrypt}: An Automated Certificate Authority to Encrypt the Entire Web}, + booktitle = {CCS}, + year = {2019}, +} + +@misc{google-metrics, + author = {{Google LLC}}, + title = {{HTTPS} encryption on the web}, + howpublished = {\url{https://transparencyreport.google.com/https/overview?hl=en}, accessed 2020-05-19}, +} + +@misc{mozilla-metrics, + author = {{Mozilla}}, + title = {{SSL} Ratios}, + howpublished = {\url{https://docs.telemetry.mozilla.org/datasets/other/ssl/reference.html}, accessed 2020-05-19}, +} + +@techreport{nordberg, + author = {Linus Nordberg and Daniel Kahn Gillmor and Tom Ritter}, + title = {Gossiping in {CT}}, + number = {draft-ietf-trans-gossip-05}, + type = {Internet-draft}, + institution = {IETF}, + year = {2018}, + url = {https://tools.ietf.org/html/draft-ietf-trans-gossip-05} +} + +@techreport{ct, + author = {Ben Laurie and Adam Langley and Emilia Kasper}, + title = {{Certificate Transparency}}, + number = {6962}, + type = {RFC}, + institution = {IETF}, + year = {2013}, + url = {https://tools.ietf.org/html/rfc6962}, +} + +@techreport{ct/bis, + author = {Ben Laurie and Adam Langley and Emilia Kasper and Eran Messeri and Rob Stradling}, + title = {{Certificate Transparency} Version 2.0}, + number = {draft-ietf-trans-rfc6962-bis-34}, + type = {Internet-draft}, + institution = {IETF}, + year = {2019}, + url = {https://tools.ietf.org/html/draft-ietf-trans-rfc6962-bis-34}, +} + +@techreport{hpkp, + author = {Chris Evans and Chris Palmer and Ryan Sleevi}, + title = {Public Key Pinning Extension for {HTTP}}, + number = {7469}, + type = {RFC}, + institution = {IETF}, + year = {2015}, + url = {https://tools.ietf.org/html/rfc7469}, +} + +@inproceedings{chuat, + author = {Laurent Chuat and Pawel Szalachowski and Adrian Perrig and Ben Laurie and Eran Messeri}, + title = {Efficient Gossip Protocols for Verifying the Consistency of Certificate Logs}, + booktitle = {CNS}, + year = {2015}, +} + 
+@inproceedings{TorDNS, + author = {Benjamin Greschbach and Tobias Pulls and Laura M. Roberts and Philipp Winter and Nick Feamster}, + title = {The Effect of {DNS} on {Tor's} Anonymity}, + booktitle = {NDSS}, + year = {2017}, +} + +@inproceedings{trickle02, + author = {Andrei Serjantov and Roger Dingledine and Paul Syverson}, + title = {From a Trickle to a Flood: Active Attacks on Several Mix Types}, + booktitle = {IH}, + year = {2002}, +} + +@inproceedings{kesdogan:ih1998, + title = {{Stop-and-Go} {MIX}es: Providing Probabilistic Anonymity in an Open System}, + author = {Dogan Kesdogan and Jan Egner and Roland B\"uschkes}, + booktitle = {IH}, + year = {1998}, +} + +@inproceedings{danezis:pets2008, + author = {George Danezis and Paul Syverson}, + title = {Bridging and Fingerprinting: Epistemic Attacks on Route Selection}, + booktitle = {PETS}, + year = {2008}, +} + +@inproceedings{long-paths, + author = {Nathan S. Evans and Roger Dingledine and Christian Grothoff}, + title = {A Practical Congestion Attack on {Tor} Using Long Paths}, + booktitle = {USENIX Security}, + year = {2009}, +} + + +@misc{tor-documentation, + author = {{Tor Project}}, + title = {Getting up to speed on {Tor's} past, present, and future}, + howpublished = {\url{https://2019.www.torproject.org/docs/documentation.html.en}, accessed 2020-12-15}, +} + +@inproceedings{PIR, + author = {Benny Chor and Oded Goldreich and Eyal Kushilevitz and Madhu Sudan}, + title = {Private Information Retrieval}, + booktitle = {FOCS}, + year = {1995}, +} + +@inproceedings{DBLP:conf/pam/AmannS16, + author = {Johanna Amann and Robin Sommer}, + title = {Exploring {Tor's} Activity Through Long-Term Passive {TLS} Traffic Measurement}, + booktitle = {PAM}, + year = {2016}, +} + +@inproceedings{1mtrack, + author = {Steven Englehardt and Arvind Narayanan}, + title = {Online Tracking: A 1-million-site Measurement and Analysis}, + booktitle = {CCS}, + year = {2016}, +} + +@techreport{diginotar, + author = {J.R. 
Prins}, + title = {{DigiNotar} Certificate Authority breach “Operation Black Tulip”}, + institution = {Fox-IT}, + year = {2011}, + type = {Interim Report}, +} + +@misc{ffct, + author = {{Bugzilla}}, + title = {Implement {Certificate Transparency} support ({RFC} 6962)}, + howpublished = {\url{https://bugzilla.mozilla.org/show_bug.cgi?id=1281469}, accessed 2020-12-15}, +} + +@misc{fftor, + author = {{Mozilla}}, + title = {Mozilla Research Grants {2019H1}}, + howpublished = {\url{https://mozilla-research.forms.fm/mozilla-research-grants-2019h1/forms/6510}, accessed 2020-12-15}, +} + +@misc{zerotor, + author = {{Zerodium}}, + title = {{Tor Browser} Zero-Day Exploit Bounty (Expired)}, + howpublished = {\url{https://zerodium.com/tor.html}, accessed 2020-12-15}, +} + +@misc{zeromain, + author = {{Zerodium}}, + title = {Our Exploit Acquisition Program}, + howpublished = {\url{https://zerodium.com/program.html}, accessed 2020-05-21}, +} + +@misc{lepop1, + author = {{Catalin Cimpanu}}, + title = {Exploit vendor drops {Tor Browser} zero-day on {Twitter}}, + howpublished = {\url{https://www.zdnet.com/article/exploit-vendor-drops-tor-browser-zero-day-on-twitter/}, accessed 2020-12-15}, +} + +@misc{lepop2, + author = {{firstwatch at sigaint.org}}, + title = {[tor-talk] Javascript exploit}, + howpublished = {\url{https://lists.torproject.org/pipermail/tor-talk/2016-November/042639.html}, accessed 2020-12-15}, +} + +@article{selfrando, + author = {Mauro Conti and Stephen Crane and Tommaso Frassetto and Andrei Homescu and Georg Koppen and Per Larsen and Christopher Liebchen and Mike Perry and Ahmad{-}Reza Sadeghi}, + title = {Selfrando: Securing the {Tor Browser} against De-anonymization Exploits}, + journal = {PETS}, + volume = {2016}, + number = {4}, +} diff --git a/summary/src/ctor/src/related.tex b/summary/src/ctor/src/related.tex new file mode 100644 index 0000000..cc5ae60 --- /dev/null +++ b/summary/src/ctor/src/related.tex @@ -0,0 +1,80 @@ +\section{Related Work} 
\label{ctor:sec:related} +The status quo is to consider a certificate CT compliant if it is accompanied by +two independent SCTs~\cite{google-log-policy,apple-on-independence}. Therefore we +proposed that Tor Browser should do the same, but unlike any other CT-enforcing +web browser, CTor also provides concrete next steps that relax the centralized +trust which is otherwise misplaced in CT logs~\cite{% + gdca1-omission,% + digicert-log-compromised,% + izenpe-disqualified,% + venafi-disqualified% +}. Several proposals surfaced that aim to do better with regard to omissions +and split-views. + +% Privacy preserving inclusion proofs +Laurie proposed that inclusion proofs could be fetched over DNS to avoid +additional privacy leaks, i.e., a user's browsing patterns are already exposed +to the DNS resolver but not the logs in the CT landscape~\cite{ct-over-dns}. +CT/bis provides the option of serving stapled inclusion proofs as part of the +TLS handshake in an extension, an OCSP response, or the certificate +itself~\cite{ct/bis}. Lueks and Goldberg proposed that a separate database of +inclusion proofs could be maintained that supports information-theoretic +PIR~\cite{lueks-and-goldberg}. Kales~\emph{et~al.} improved scalability by +reducing the size of each entry in the PIR database at the cost of transforming +logs into multi-tier Merkle trees, and additionally showed how the upper tier +could be expressed as a two-server computational PIR database to ensure that any +inclusion proof can be computed privately on-the-fly~\cite{kales}. +Nordberg~\emph{et~al.} avoid inclusion proof fetching by hanging on to presented +SFOs, handing them back to the same origin at a later time~\cite{nordberg}. In +contrast, CTor protects the user's privacy without any persistent browser state +by submitting SFOs on independent Tor circuits to CTRs, which in turn add random +noise before there is any log interaction.
The use of CTRs enables caching +similar to CT-over-DNS, but it does not put the logs in the dark like PIR could. + +% The same consistent view +Inclusion proofs are only meaningful if everyone observes the same consistent +STHs. One option is to configure client software with a list of entities that +they should gossip with, e.g., CT monitors~\cite{chase}, or, browser vendors +could push a verified view~\cite{sth-push}. Such trusted auditor relationships +may work for some but not others~\cite{nordberg}. Chuat~\emph{et~al.} proposed +that HTTPS clients and HTTPS servers could pool STHs and consistency proofs, +which are gossiped on website visits~\cite{chuat}. Nordberg~\emph{et~al.} +suggested a similar variant, reducing the risk of user tracking by pooling fewer +and recent STHs~\cite{nordberg}. Dahlberg~\emph{et~al.} noted that such +privacy-insensitive STHs need not be encrypted, which could enable network +operators to use programmable data planes to provide gossip +as-a-service~\cite{dahlberg}. Syta~\emph{et~al.} proposed an alternative to +reactive gossip mechanisms by showing how an STH can be cosigned efficiently by +many independent witnesses~\cite{syta}. A smaller-scale version of witness +cosigning could be instantiated by cross-logging STHs in other CT +logs~\cite{minimal-gossip}, or in other append-only ledgers~\cite{catena}. +CTor's full design (Section~\ref{ctor:sec:base}) ensures that anyone connected to the +Tor network is on the same view by making STHs public in the Tor consensus. In +contrast, the first incremental design (Section~\ref{ctor:sec:incremental}) is not +concerned with catching log misbehavior, while the second incremental design +(also Section~\ref{ctor:sec:incremental}) exposes misbehaving logs \emph{without} +first trying to fetch inclusion proofs. + +% Other work that is closely related to our approach +Nordberg proposed that Tor clients could enforce public logging of consensus +documents and votes~\cite{consensus-transparency}. 
Such an initiative is mostly +orthogonal to CTor, as it strengthens the assumption of a secure Tor consensus +by enabling detection of compromised signing keys rather than mis-issued TLS +certificates. Winter~\emph{et~al.} proposed that Tor Browser could check +self-signed TLS certificates for exact matches on independent Tor +circuits~\cite{spoiled-onions}. Alicherry~\emph{et~al.} proposed that any web +browser could double-check TLS certificates on first encounter using alternative +paths and Tor, again, looking for certificate mismatches and generating warnings +of possible man-in-the-middle attacks~\cite{doublecheck}. The submission phase +in CTor is similar to double-checking, except that there are normally no TLS +handshake blocking, browser warnings, or strict assumptions regarding the +attacker's location. + +% Parallel to our work +In parallel Stark and Thompson proposed that Chrome could submit a random subset +of encountered SCTs to a trusted auditor that Google runs~\cite{stark}. CTor +also propagates a random subset of SCTs to a trusted auditor, but does so while +preserving privacy because of and how Tor is used. Meiklejohn additionally +proposed witness cosigning on-top of consistent STHs~\cite{meiklejohn}. CTor +adds signatures on-top of STHs too, but only as part of the Tor consensus that +directory authorities sign. diff --git a/summary/src/introduction/img/contribs.pdf b/summary/src/introduction/img/contribs.pdf new file mode 100644 index 0000000..a7baa39 Binary files /dev/null and b/summary/src/introduction/img/contribs.pdf differ diff --git a/summary/src/introduction/img/contribs.svg b/summary/src/introduction/img/contribs.svg new file mode 100644 index 0000000..c05e93d --- /dev/null +++ b/summary/src/introduction/img/contribs.svg @@ -0,0 +1,2213 @@ + + + +KeyingOperation using a key-driven device, e.g. typing. (IBM)Auxiliary OperationOffline operation. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Paper IPaper IIPaper IIIPaper IVPaper VPaper VIC1C2C3C4C5C6RQ1RQ2RQ3 diff --git a/summary/src/introduction/img/ct.pdf b/summary/src/introduction/img/ct.pdf new file mode 100644 index 0000000..bb14266 Binary files /dev/null and b/summary/src/introduction/img/ct.pdf differ diff --git a/summary/src/introduction/img/ct.svg b/summary/src/introduction/img/ct.svg new file mode 100644 index 0000000..bd4641f --- /dev/null +++ b/summary/src/introduction/img/ct.svg @@ -0,0 +1,1346 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + Logs + + + + + + + + + + + + + + + + + + Monitor + + + + + + + + + + + + + + + + + + + + + + + + + + + + Website + + + + + + + + + + + + + + + + + + Browser + + + + + + initiate website visit + certificate + certificate included? + proof + + + continuousdownload + + + + diff --git a/summary/src/introduction/main.tex b/summary/src/introduction/main.tex new file mode 100644 index 0000000..da013ab --- /dev/null +++ b/summary/src/introduction/main.tex @@ -0,0 +1,826 @@ +\section{Introduction} + +The security posture of the Internet increased significantly throughout the +last decade. For example, + the cleaned-up and formally verified TLS 1.3 protocol that underpins HTTPS + has been rolled-out gradually~\cite{tls-timeline}, + the certificates that specify which public keys to use when bootstrapping a + secure connection can be obtained for free and automatically~\cite{le}, and + web browsers have shifted from positive to negative security indicators in + favor of security-by-default~\cite{browser-ui}. 
+The use of end-to-end encryption has further become the norm with services such +as + DNS-over-HTTPS~\cite{rfc8484}, + virtual private networks~\cite{wireguard}, + Tor~\cite{tor}, and + secure messaging~\cite{mls} +gaining traction. In other words, the era of attackers that can passively snoop +and actively tamper with unencrypted~network~traffic~is~over. + +What will remain the same is the incentive for attackers to snoop and tamper +with network traffic. Therefore, the focus is (and will likely continue to be) +on circumventing protocols that add security and privacy as they are deployed +in the real world. For example, there is a long history of certificate +mis-issuance that allows attackers to impersonate websites and thus insert +themselves as machines-in-the-middle (``MitM'') without actually breaking +TLS~\cite{sok-https,sslmate-history}. Or, in the case of encrypted channels +that are hard to intercept, instead analyzing traffic patterns to infer user +activity like which website is being +visited~\cite{cheng98,herrmann09,hintz02,liberatore06,panchenko11,sun02}. The +bad news is that attackers only need to find one vulnerability in a deployed +protocol or its software. Sometimes, such vulnerabilities can be purchased by +zero-day brokers like Zerodium~\cite{zerodium}. + +To address an attack vector, it is common to add countermeasures that frustrate +attackers and/or increase the risk involved while trying to exploit a system. +A good example is how the certificate authority ecosystem evolved. For +background, certificate authorities are trusted parties that validate domain +names before issuing certificates that list their public keys. Web browsers +are shipped with hundreds of trusted certificate authorities, which means that +the resulting TLS connections cannot be more secure than the difficulty of +hijacking the weakest-link certificate authority~\cite{sok-https}. 
A proposal +eventually deployed to mitigate this issue is Certificate Transparency: an +ecosystem of public append-only logs that publishes all certificates so that +any mis-issuance can be \emph{detected} by monitors~\cite {ct,rfc6962}. +These logs have a cryptographic foundation that holds them and the issuing +certificate authorities \emph{accountable}, at least in theory. In practice, +the logs are essentially trusted parties that must act honestly due to how web +browsers shape their policies to respect user +privacy~\cite{apple-log-policy,google-log-policy,sok-sct-auditing,ct-history}. + +The first objective of this thesis is to better understand the current limits +of Certificate Transparency by proposing and evaluating improvements which +\emph{reduce} the amount of trust that needs to be placed in third-party +monitors and logs. We make a dent in the problem of Certificate Transparency +verification both generally and concretely in the context of Tor Browser, which +unlike Google Chrome and Apple's Safari does not support Certificate +Transparency yet. For context, Tor Browser is a fork of +Mozilla's Firefox that (among other things) routes user traffic through the +low-latency anonymity network Tor~\cite{tor,tb}. As part of our pursuit to +improve the status quo for Certificate Transparency verification in Tor +Browser, the second objective of this thesis is to evaluate how the protocols +used during website visits affect unlinkability between senders (web browsers) +and receivers (websites). Our evaluation applies to our addition of +Certificate Transparency and other protocols already in use, e.g., + DNS, + real-time bidding~\cite{rtb}, and + certificate revocation checking~\cite{ocsp}. + +The remainder of the introductory summary is structured as follows. +Section~\ref{sec:background} introduces background that will help the reader + understand the context and preliminaries of the appended papers. 
+Section~\ref{sec:rqs} defines our research questions and overall objective. +Section~\ref{sec:methods} provides an overview of our research methods. +Section~\ref{sec:contribs} describes our contributions succinctly. +Section~\ref{sec:appended} summarizes the appended papers that are published in + NordSec (Paper~\ref{paper:lwm}), + SECURWARE (Paper~\ref{paper:ctga}), + PETS (Paper~\ref{paper:ctor} and~\ref{paper:cat}), + WPES (Paper~\ref{paper:sauteed}), and + USENIX Security (Paper~\ref{paper:tlwo}). +Section~\ref{sec:related} positions our contributions with regard to related +work. Section~\ref{sec:concl} concludes and briefly discusses future work. + +\section{Background} \label{sec:background} + +This section introduces background on Certificate Transparency and Tor. + +\subsection{Certificate Transparency} + +The web's public-key infrastructure depends on certificate authorities to issue +certificates that map domain names to public keys. For example, the +certificate of \texttt{www.example.com} is issued by DigiCert and lists a +2048-bit RSA key~\cite{crt:www.example.com}. The fact that DigiCert signed +this certificate means that they claim to have verified that the requesting +party is really \texttt{www.example.com}, typically by first ensuring that a +specified DNS record can be uploaded on request~\cite{ca/b}. If all +certificate authorities performed these checks correctly and the checks +themselves were fool-proof, a user's browser could be sure that any +certificate signed by a certificate authority would list a verified public key +that can be used for authentication when connecting to a website via TLS. +Unfortunately, there are hundreds of trusted certificate authorities and a long +history of issues surrounding their operations in +practice~\cite{bambo-cas,sok-https,sslmate-history}. 
One of the most famous +incidents took place in 2011: an attacker managed to mis-issue certificates from +DigiNotar to intercept traffic towards Google and others in +Iran~\cite{black-tulip}. The astonishing part is that this incident was first +detected \emph{seven weeks later}. + +Certificate Transparency aims to facilitate \emph{detection} of issued +certificates, thus holding certificate authorities \emph{accountable} for any +certificates that they mis-issue~\cite{ct,rfc6962}. The basic idea is shown in +Figure~\ref{fig:ct-idea}. In addition to regular validation rules, browsers +ensure certificates are included in a public append-only Certificate +Transparency \emph{log}. +This allows anyone to get a concise view of all certificates that users +may encounter, including domain owners like Google who can then see for +themselves whether any of the published certificates are mis-issued. The +parties inspecting the logs are called \emph{monitors}. Some monitors mirror +all log entries~\cite{crt.sh}, while others discard most of them in pursuit of +finding matches for pre-defined criteria like +\texttt{*.example.com}~\cite{certspotter}. Another option is subscribing to +certificate notifications from a trusted third-party~\cite{ct-monitors}. + +\begin{figure}[!t] + \centering\includegraphics[width=0.8\textwidth]{src/introduction/img/ct} + \caption{% + The idea of Certificate Transparency. Certificates encountered by users + must be included in a public log so that monitors can detect mis-issuance. + } + \label{fig:ct-idea} +\end{figure} + +What makes Certificate Transparency a significant improvement compared to the +certificate authority ecosystem is that the logs stand on a cryptographic +foundation that can be verified. A log can be viewed as an append-only +tamper-evident list of certificates. It is efficient\footnote{% + Efficient refers to space-time complexity $\mathcal{O}(\mathsf{log}(n))$, + where $n$ is the number of log entries. 
+} to prove cryptographically that a certificate is in the list, and that a +current version of the list is append-only with regard to a previous version +(i.e., no tampering or reordering).\footnote{% + Interested readers can refer to our Merkle tree and proof technique + introduction online~\cite{merkle-intro}. +} These properties follow from using a Merkle tree structure that supports +\emph{inclusion} and \emph{consistency} +proofs~\cite{history-trees,ct-formal,rfc6962,merkle}. +The reader +only needs to know that these proofs are used to reconstruct a log's Merkle tree +head, often referred to as a \emph{root hash}. It is a cryptographic hash +identifying a list of certificates uniquely in a tree data structure. The logs +sign root hashes with the number of entries and a timestamp to form \emph{signed +tree heads}. So, if an inconsistency is discovered, it cannot be denied. Log +operators are therefore held accountable for maintaining the append-only +property. A party that verifies the efficient transparency log proofs without +downloading all the logs is called an \emph{auditor}. + +A log that signs two inconsistent tree heads is said to perform a +\emph{split-view}. To ensure that everyone observes the same append-only logs, +all participants of the Certificate Transparency ecosystem must engage in a +\emph{gossip protocol}~\cite{chuat,nordberg}. In other words, just because +Alice observes an append-only log, it is not necessarily the \emph{same +append-only log} that Bob observes. Therefore, Alice and Bob must exchange +signed tree heads and verify consistency to assert that the log operators play +by the rules and only append certificates. Without a secure gossip protocol, +log operators would have to be trusted blindly (much like certificate +authorities before Certificate Transparency). RFC~6962 defers the specification +of gossip~\cite{rfc6962}, with little or no meaningful gossip deployed yet. 
+ +Rolling out Certificate Transparency without breakage on the web is a +challenge~\cite{does-ct-break-the-web}. Certificates must be logged, associated +proofs delivered to end-user software, and more. One solution RFC~6962 +ultimately put forth was the introduction of \emph{signed certificate +timestamps}. A signed certificate timestamp is a log's \emph{promise} that a +certificate will be appended to the log within a \emph{maximum merge delay} +(typically 24 hours). Verifying if a log holds its promise is +usually called \emph{auditing}. +Certificate authorities can obtain signed certificate +timestamps and embed them in their final certificates by logging a +\emph{pre-certificate}. As such, there is no added latency from building the +underlying Merkle tree and no need for server software to be updated (as the +final certificate contains the information needed). The current policy for +Google Chrome and Apple's Safari is to reject certificates with fewer than two +signed certificate timestamps~\cite{apple-log-policy,google-log-policy}. How to +request an inclusion proof for a promise without leaking the user's browsing +history to the log is an open problem~\cite{sok-sct-auditing}. In other words, +asking for an inclusion proof trivially reveals the certificate of interest to +the log. + +Other than embedding signed certificate timestamps in certificates, they can be +delivered dynamically to end-users in TLS extensions and stapled certificate +status responses. For example, Cloudflare uses the TLS extension delivery +method to recover from log incidents without their customers needing to acquire +new certificates~\cite{cloudflare-scts}. Several log incidents have already +happened in the past, ranging from +split-views~\cite{trustasia-err,izenpe-err,venafi-err} to broken promises of +timely logging~\cite{wosign-err,google-err,digicert-err,starcom-err} and +potential key compromise~\cite{digicert-kc}. 
These are all good \emph{scares} +motivating continued completion of Certificate Transparency in practice. + +In summary, the status quo is for web browsers to require at least two signed +certificate timestamps before accepting a certificate as valid. Merkle tree +proofs are not verified. Gossip is not deployed. The lack of a reliable +\emph{gossip-audit model} means that the logs are largely trusted +parties.\footnote{% + Historical remark: the lack of verification led Google to require that all + certificates be disclosed in at least one of their logs to + validate~\cite{ct-history}. The so-called \emph{one-Google log requirement} + was recently replaced. Google Chrome instead interacts with Google's trusted + auditor. See Section~\ref{sec:related}. +} We defer discussion of related work in the area of gossip-audit models until +Section~\ref{sec:related}. + +\subsection{Tor} + +The Tor Project is a 501(c)(3) US nonprofit that advances human rights and +defends privacy online through free software and open networks~\cite{tpo}. Some +of the maintained and developed components include Tor Browser and Tor's relay +software. Thousands of volunteers operate relays as part of the Tor network, +which routes the traffic of millions of daily users with low +latency~\cite{mani}. This frustrates attackers like Internet service providers +that may try linking \emph{who is communicating with whom} from their local +(non-global) vantage points~\cite{tor}. + +Usage of Tor involves tunneling the TCP traffic of different destinations (such +as all flows associated with a website visit to \texttt{example.com}) in +fixed-size \emph{cells} on independent \emph{circuits}. A circuit is built +through a guard, a middle, and an exit relay. At each hop of the circuit, one +layer of symmetric encryption is peeled off. The used keys are ephemeral and +discarded together with all other circuit state after at most 10 minutes (the +maximum circuit lifetime). 
This setup allows guard relays to observe users' IP +addresses but none of the destination traffic. In contrast, exit relays can +observe destination traffic but no user IP addresses. The relays used in a +circuit are determined by Tor's end-user software. Such path selection +is randomized and bandwidth-weighted but starts with a largely static guard set +to protect users from \emph{eventually} entering the network from a relay an +attacker volunteered to run. + +Tor's \emph{consensus} lists the relays that make up the network. As the name +suggests, it is a document agreed upon by a majority of trusted \emph{directory +authorities}. Five votes are currently needed to reach a consensus. Examples +of information added to the Tor consensus include tunable network parameters and +uploaded relay descriptors with relevant metadata, e.g., public key, available +bandwidth, and exit policy. Each relay in the consensus is also assigned +different flags based on their configuration and observed performance, e.g., +\texttt{Guard}, \texttt{MiddleOnly}, \texttt{Fast}, \texttt{Stable}, and +\texttt{HSDir}. The latter means that the relay is a \emph{hidden service +directory}, which refers to being part of a distributed hash table that helps +users look up \emph{onion service introduction points}. + +An onion service is a self-authenticated server identified by its public key. +Onion services are only reachable through the Tor network. Users that are aware +of a server's \emph{onion address} can consult the distributed hash table to +find its introduction points. To establish a connection, a user builds a +circuit to a \emph{rendezvous point}. A request is then sent to one of the +current introduction points, which informs the onion service that it may build +its own circuit to meet the user at their rendezvous point. In total, six +relays are traversed while interacting with an onion service. This setup allows +not only the sender but also the receiver to be anonymous. 
The receiver also +benefits from a large degree of censorship resistance as the server location may +be hidden. The main drawback of onion services is that their non-mnemonic names +are hard to discover and remember. Some sites try to overcome this by setting +their onion addresses in \emph{onion location} HTTP headers or HTML +attributes~\cite{onion-location}. + +Many users use Tor Browser to connect to the Tor network. In addition to +routing traffic as described above, Tor Browser ships with privacy-preserving +features like first-party isolation to not share any state across +different origins, settings that frustrate browser fingerprinting, and +\emph{disk-avoidance} to not store browsing-related history as well as other +identifying information to disk~\cite{tb}. Tor Browser is a fork of Mozilla's +Firefox. Unfortunately, neither Firefox nor Tor Browser supports any form of +Certificate Transparency. Conducting undetected machine-in-the-middle attacks +against Tor users is thus relatively straightforward: compromise or coerce the +weakest-link certificate authority, then volunteer to operate an exit relay and +intercept network traffic. Such interception has previously been found with +self-signed certificates~\cite{spoiled-onions}. + +While global attackers are not within Tor's threat model, it is in scope to +guard against various local attacks~\cite{tor}. For example, the intended +attacker may passively observe a small fraction of the network and actively +inject their own packets. Figure~\ref{fig:wf} shows the typical attacker +setting of \emph{website fingerprinting}, where the attacker observes a user's +entry traffic with the goal of inferring which website was visited solely based +on analyzing encrypted +traffic~\cite{cheng98,herrmann09,hintz02,liberatore06,panchenko11,sun02}. +Website fingerprinting attacks are evaluated in the \emph{open-world} or +\emph{closed-world} settings. 
In the closed-world setting, the attacker +monitors (not to be confused with Certificate Transparency monitoring) a fixed +list of websites. A user visits one of the monitored sites, and the attacker +needs to determine which one. The open-world setting is the same as the +closed-world setting, except that the user may also visit unmonitored sites. +The practicality of website fingerprinting attacks is up for debate, e.g., +ranging from challenges handling false positives to machine-learning dataset +drift~\cite{onlinewf,juarez14,perryCrit,realistic}. + +\begin{figure}[!t] + \centering\includegraphics[width=0.85\textwidth]{src/cat/img/setting} + \caption{ + The setting of a website fingerprinting attack. A local passive attacker + analyzes a user's encrypted network traffic as it enters the network. The + goal is to infer which website is visited. (Figure reprinted from + Paper~\ref{paper:cat}.) + } + \label{fig:wf} +\end{figure} + +In summary, Tor is a low-latency anonymity network often accessed with Tor +Browser. Among the threats that Tor aims to protect against are local attackers +that see traffic as it enters or leaves the network (but not both at the same +time all the time). A website fingerprinting attack is an example of a passive +attack that operates on entry traffic. A machine-in-the-middle attack is an +example of an active attack that typically operates on exit traffic. Discussion +of related work in the area of website fingerprinting is deferred until +Section~\ref{sec:related}. + +\section{Research Questions} \label{sec:rqs} + +The overall research objective spans two different areas: transparency logs and +low-latency anonymity networks. We aim to reduce trust assumptions in +transparency log solutions and to apply such solutions in anonymous settings for +improved security and privacy. 
We defined the following research questions to +make this concrete in Certificate Transparency and Tor, the two ecosystems with +the most history and dominant positions in their respective areas. + +\begin{researchquestions} + \item[Can trust requirements in Certificate Transparency be reduced in + practice?] + + Transparency logs have a cryptographic foundation that supports efficient + verification of inclusion and consistency proofs. Such proofs are useful to + reduce the amount of trust that is placed in the logs. The roll-out of + Certificate Transparency has yet to start using these proofs, and to employ a + gossip protocol that ensures the same append-only logs are observed. Part of + the challenge relates to privacy concerns as parties interact with each other, + as well as deploying gradually without breakage. + + We seek practical solutions that reduce the trust requirements currently + placed in the logs and third-party monitors while preserving user privacy. + + \item[How can authentication of websites be improved in the context of Tor?] + + Tor Browser has yet to support Certificate Transparency to facilitate + detection of hijacked websites. This includes HTTPS sites but also onion + services that may be easier to discover reliably with more transparency. + + We seek incremental uses of Certificate Transparency in Tor that preserve user + privacy while engaging in new verification protocols to reduce trust. + + \item[How do the protocols used during website visits affect + unlinkability between Tor users and their destination websites?] + + Several third-parties become aware of a user's browsing activities while a + website is visited. For example, DNS resolvers and certificate status + responders may be consulted for domain name resolution and verification of + whether a certificate has been revoked. Fetching an inclusion proof from a + Certificate Transparency log would reveal the same type of information. 
+ + We seek to explore how unlinkability between Tor users and their exit + destinations is affected by the multitude of protocols used during website + visits. The considered setting is the same as in website fingerprinting, + except that the attacker may take additional passive and active measures. + For example, the attacker may volunteer to run a Certificate Transparency log + (passive) or inject carefully-crafted packets into Tor~(active). +\end{researchquestions} + +\section{Research Methods} \label{sec:methods} + +We tackle privacy and security problems in the field of computer +science~\cite{icss,smics}. Our work is applied, following the scientific method +for security and experimental networking research. \emph{Exactly} what it means +to use the scientific method in these areas is up for debate~\cite{rfenr,sse}. +However, at a glance, it is about forming precise and consistent theories with +falsifiable predictions \emph{as in other sciences} except that the objects of +study are \emph{information systems in the real world}. + +A prerequisite to formulating precise, consistent, and falsifiable theories is +that there are few implicit assumptions. Therefore, scientific security +research should be accompanied by definitions of security goals and attacker +capabilities: what does it mean that the system is secure, and what is the +attacker (not) allowed to do while attacking it~\cite{secdefs}? Being explicit about the +overall \emph{setting} and \emph{threat model} is prevalent in formal security +work like cryptography, where an abstract (mathematical) model is used to show +that security can be \emph{proved} by reducing to a computationally hard problem +(like integer factorization) or a property of some primitive (like the collision +resistance of a hash function)~\cite{provsec}. 
It is nevertheless just as crucial in less +formal work that deals with security of systems in the real (natural) +world---the exclusive setting of the scientific method---which usually lends +itself towards break-and-fix cycles in light of new observations. Where to draw +the line between \emph{security work} and \emph{security research} is not +trivial. However, a few common \emph{failures} of past ``security research'' +include not bringing observations in contact with theory, not making claims and +assumptions explicit, or simply relying on unfalsifiable claims~\cite{sse}. + +While deductive approaches (like formal reduction proofs) are instrumental in +managing complexity and gaining confidence in different models, more than these +approaches are required as a model's \emph{instantiation} must also be secure~\cite{secdefs}. +It is common to complement abstract modeling with \emph{real-world measurements} +as well as \emph{systems prototyping and evaluations}~\cite{rfenr}. Real-world +measurements measure properties of deployed systems like the Internet, the web, +and the Tor network. For example, a hypothesis in a real-world measurement +could be that (non-)Tor users browse according to the same website popularity +distribution. Sometimes these measurements involve the use of research +prototypes, or the research prototypes themselves become the objects of study to +investigate properties of selected system parts (say, whether a packet processor +with new features is indistinguishable from some baseline as active network +attackers adaptively inject packets of their choosing). If it is infeasible, +expensive, or unsafe (see below) to study a real-world system, a simulation may +be studied instead. The downside of simulation is that the model used may not +be a good approximation of the natural world, similar to formal +cryptographic~modeling. 
+ +The appended papers use all of the above approaches to make claims about +security, privacy, and performance in different systems, sometimes with regard +to an abstract model that can be used as a foundation in the natural world to +manage complexity. Paper~\ref{paper:lwm} contains a reduction proof sketch to +show reliance on standard cryptographic assumptions. Paper~\ref{paper:cat} +extends past simulation setups to show the impact of an added attacker +capability. Meanwhile, Paper~\ref{paper:ctor} models part of the Tor network +with mathematical formulas to estimate performance overhead. All but +Paper~\ref{paper:sauteed} contain real-world measurements relating to Internet +infrastructure, websites, certificates, Tor, or practical deployability of our +proposals. All but Paper~\ref{paper:ctor} contain research prototypes with +associated evaluations, e.g., performance profiling, as well as corroborating or +refuting our security definitions in experimental settings. All papers include +discussions of security and privacy properties as well as their limitations and +strengths in the chosen settings (where assumptions are explicit and threat +models motivated). + +Throughout our experiments, we strived to follow best practices like documenting +the used setups, making datasets and associated tooling available, reducing +potential biases by performing repeated measurements from multiple different +vantage points, and discussing potential biases (or lack thereof)~\cite{rfenr}. +We also interacted with Tor's research safety board~\cite{trsb} to discuss the +ethics and safety of our measurements in Paper~\ref{paper:tlwo}, and refrained +from measuring real (i.e., non-synthetic) usage of Tor whenever possible +(Papers~\ref{paper:ctor} and~\ref{paper:cat}). Finally, the uncovered bugs and +vulnerabilities in Papers~\ref{paper:cat}--\ref{paper:tlwo} were responsibly +disclosed to the Tor project. This included suggestions on how to move forward. 
+ +\section{Contributions} \label{sec:contribs} + +The main contributions of this thesis are listed below. An overview of how they +relate to our research questions and appended papers is shown in +Figure~\ref{fig:contrib}. + +\begin{figure}[!t] + \centering + \includegraphics[width=0.83\textwidth]{src/introduction/img/contribs} + \caption{% + Overview of appended papers, contributions, and research questions. + } + \label{fig:contrib} +\end{figure} + +\vspace{1cm} +\begin{contributions} + \item[Reduced trust in third-party monitoring with a signed tree head + extension that shifts trust from non-cryptographic certificate notifications + to a log's gossip-audit model (or if such a model does not exist yet, the + logs themselves).] + + Paper~\ref{paper:lwm} applies existing cryptographic techniques for + constructing static and lexicographically ordered Merkle trees so that + certificates can be wild-card filtered on subject alternative names with + (non-)membership proofs. This building block is evaluated in the context of + Certificate Transparency, including a security sketch and performance + benchmarks. + + \item[Increased probability of split-view detection by proposing gossip + protocols that disseminate signed tree heads without bidirectional + communication.] + + Paper~\ref{paper:ctga} explores aggregation of signed tree heads at line + speed in programmable packet processors, facilitating consistency proof + verification on the level of an entire autonomous system. Such verification + can be indistinguishable from an autonomous system without any split-view + detection to achieve herd immunity, i.e., protection without aggregation. + Aggregation at 32 autonomous systems can protect 30--50\% of the IPv4 space. + Paper~\ref{paper:ctor} explores signed tree heads in Tor's consensus. To + reliably perform an undetected split-view against log clients that have Tor + in their trust root, a log must collude with a majority of directory + authorities. 
+ + \item[Improved detectability of website hijacks targeting Tor Browser by + proposing privacy-preserving and gradual roll-outs of Certificate + Transparency in Tor.] + + Paper~\ref{paper:ctor} explores adoption of Certificate Transparency in Tor + Browser with signed certificate timestamps as a starting point, then + leveraging the decentralized network of relays to cross-log certificates + before ultimately verifying inclusion proofs against a single view in Tor's + consensus. The design is probabilistically secure with tunable parameters + that result in modest overheads. Paper~\ref{paper:sauteed} shows that + Certificate Transparency logging of domain names with associated onion + addresses helps provide forward censorship-resistance and detection of + unwanted onion associations. + + \item[An extension of the attacker model for website fingerprinting that + provides attackers with the capability of querying a website oracle.] + + A website oracle reveals whether a monitored website was (not) visited by + any network user during a specific time frame. Paper~\ref{paper:cat} + defines and simulates website fingerprinting attacks with website oracles, + showing that most false positives can be eliminated for all but the most + frequently visited websites. A dozen sources of real-world website oracles + follow from the protocols used during website visits. We enumerate and + classify those sources based on ease of accessibility, reliability, and + coverage. The overall analysis includes several Internet measurements. + + \item[Remotely-exploitable probing-attacks on Tor's DNS cache that instantiate + a real-world website oracle without any special attacker capabilities or reach.] + + Paper~\ref{paper:cat} shows that timing differences in end-to-end response + times can be measured to determine whether a domain name is (not) cached by + a Tor relay. An estimated true positive rate of 17.3\% can be achieved + while trying to minimize false positives. 
Paper~\ref{paper:tlwo} improves + the attack by exploiting timeless timing differences that depend on + concurrent processing. The improved attack has no false positives or false + negatives. Our proposed bug fixes and mitigations have been merged in Tor. + + \item[A complete redesign of Tor's DNS cache that defends against all (timeless) + timing attacks while retaining or improving performance compared~to~today.] + + Paper~\ref{paper:tlwo} suggests that Tor's DNS cache should only share the + same preloaded domain names across different circuits to remove the + remotely-probable state that reveals information about past exit traffic. A + network measurement with real-world Tor relays shows which popularity lists + are good approximations of Tor usage and, thus, appropriate to preload. + Cache-hit ratios can be retained or improved compared to today's Tor. +\end{contributions} + +\section{Summary of Appended Papers} \label{sec:appended} + +The appended papers and their contexts are summarized below. Notably, all +papers are in publication-date order except that Paper~\ref{paper:cat} predates +Papers~\ref{paper:ctor}--\ref{paper:sauteed}. + +{ + \hypersetup{linkcolor=black} + \listofsummaries +} + +\section{Related Work} \label{sec:related} + +This section positions the appended papers with regard to related work. For +Certificate Transparency, this includes approaches towards signed certificate +timestamp verification, gossip, and the problem of monitoring the logs. The +related work with regard to Tor is focused on the practicality of website +fingerprinting attacks and prior use of side-channels (such as timing attacks). + +\subsection{Certificate Transparency Verification} + +Approaches that fetch inclusion proofs have in common that they should preserve +privacy by not revealing the link between users and visited websites. 
+Eskandarian~\emph{et~al.}\ mention that Tor could be used to overcome privacy +concerns; however, it comes at the cost of added infrastructure +requirements~\cite{eskandarian}. Lueks and Goldberg~\cite{lueks} and +Kales~\emph{et~al.}~\cite{kales} suggest that logs could provide inclusion +proofs using multi-server private information retrieval. This requires a +non-collusion assumption while also adding significant overhead. +Laurie suggests that users can fetch inclusion proofs via DNS as their resolvers +already learned the destination sites~\cite{ct-over-dns}. While surveying +signed certificate timestamp auditing, +Meiklejohn~\emph{et~al.}\ point out that Certificate Transparency over DNS may +have privacy limitations~\cite{sok-sct-auditing}. For example, the times of +domain lookups and inclusion proof queries are detached. +Paper~\ref{paper:ctga} uses Laurie's approach as a premise while +proposing a gossip protocol. +Paper~\ref{paper:ctor} applies Certificate Transparency in a context +where Tor is not additional infrastructure~(Tor~Browser). 
This approach was recently +replaced by opt-out auditing that cross-checks a fraction of signed +certificate timestamps with Google using +k-anonymity~\cite{opt-out-sct-auditing}. Henzinger~\emph{et al.} show how such +k-anonymity can be replaced with a single-server private information retrieval +setup that approaches the performance of prior multi-server +solutions~\cite{henzinger}. None of the latter two proposals provide a solution +for privately reporting that a log may have violated its maximum merge delay +because the trusted auditor is assumed to know about all signed certificate +timestamps. Eskandarian~\emph{et~al.}\ show how to prove that a log omitted a +certificate privately~\cite{eskandarian}. However, they use an invalid +assumption about today's logs being in strict timestamp +order~\cite{sok-sct-auditing}. Paper~\ref{paper:ctor} suggests that Tor Browser +could submit a fraction of signed certificate timestamps to randomly selected +Tor relays. These relays perform further auditing on Tor Browser's behalf: much +like a trusted auditor, except that no single entity is running it. + +Merkle trees fix log content---not promises of logging. Therefore, +inclusion proof fetching by users or their trusted parties must be accompanied +by consistency verification and gossip to get a complete gossip-audit +model~\cite{rfc6962}. Chuat~\emph{et~al.}\ suggest that users and web servers +can pool signed tree heads, gossiping about them as they interact~\cite{chuat}. +Nordberg~\emph{et~al.}\ similarly suggest that users can pollinate signed tree +heads as they visit different web servers~\cite{nordberg}. Hof and Carle +suggest that signed tree heads could be cross-logged to make all logs +intertwined~\cite{hof}. Gunn~\emph{et~al.}\ suggest multi-path fetching of +signed tree heads~\cite{gunn}, which may make persistent split-views hard +depending on the used multi-paths. 
Syta~\emph{et~al.}\ suggest that independent +witnesses could cosign the logs using threshold signatures~\cite{syta}. +Smaller-scale versions of witness cosigning received attention in +industry~\cite{sigsum-witness,trustfabric-arxiv}, and generally in other types +of transparency logs as well~\cite{parakeet}. Larger browser vendors could +decide to push the same signed tree heads to their users, as proposed by Sleevi +and Messeri~\cite{sth-push}. Paper~\ref{paper:ctga} uses the operators of +network vantage points for aggregating and verifying signed tree heads to +provide their users with gossip-as-a-service, however assuming plaintext +DNS traffic and a sound signed tree head frequency as defined by +Nordberg~\emph{et~al.}~\cite{nordberg}. We used the multi-path assumptions of +Gunn~\emph{et~al.}\ to break out of local vantage points. In contrast, +Paper~\ref{paper:ctor} ensures that the same logs are observed in the Tor +network by incorporating signed tree heads into Tor's consensus (thus making +directory authorities into witnesses). + +Li~\emph{et~al.}\ argue that it would be too costly for most domains to run a +monitor~\cite{li}.\footnote{% + Whether the third-party monitors in this study misbehaved or not can be + questioned~\cite{ayer-on-li}. +} Similar arguments have been raised before, and led to alternative data +structures that could make monitoring more efficient than today's +overhead~\cite{vds,coniks,tomescu}. Paper~\ref{paper:lwm} falls into this +category, as the root of an additional static lexicographically-ordered Merkle +tree is added to a log's signed tree heads to encode batches of included +certificates. The downside is that a non-deployed signed tree head extension is +assumed~\cite{rfc9162}, as well as a tree head frequency similar to those +described by Nordberg +\emph{et~al.}~\cite{nordberg}~to~get~efficiency~in~practice. 
+ +Paper~\ref{paper:sauteed} uses a Mozilla Firefox web extension to verify +embedded signed certificate timestamps in Tor Browser. Such verification is +similar to the gradual deployments of Certificate Transparency in other +browsers~\cite{ct-history,does-ct-break-the-web}, and the starting point to +improve upon in Papers~\ref{paper:ctga}--\ref{paper:ctor}. Moreover, the use of +Certificate Transparency to associate human-meaningful domain names with +non-mnemonic onion addresses (as in Paper~\ref{paper:sauteed}) is one of many +proposals for alternative naming systems and onion search +solutions~\cite{kadianakis,muffet-onions,nurmi,onion-location,h-e-securedrop,onio-ns}. + +\subsection{Website Fingerprinting and Side-Channels} + +Several researchers outline how past website fingerprinting attacks have been +evaluated in unrealistic conditions~\cite{juarez14,perryCrit,realistic}. +This includes not accounting for the size of the open-world setting, failing to +keep false positive rates low enough to be useful, assuming that homepages are +browsed one at a time, how to avoid dataset drift, and training classifiers on +synthetic network traces. While some of these challenges were +addressed~\cite{onlinewf,realistic}, the question of how to deal with false +positives remains open. Papers~\ref{paper:cat}--\ref{paper:tlwo} make a +significant dent in this problem by providing evidence that the website +fingerprinting attacker model could be made \emph{stronger} to capture +\emph{realistic real-world capabilities} that eliminate most false positives +around Alexa top-10k and the long tail~of~unpopular~sites. + +Others have evaluated traffic analysis attacks against Tor beyond the website +fingerprinting setting. On one side of the spectrum are end-to-end +correlation/confirmation attacks that typically consider a global passive +attacker that observes all network +traffic~\cite{johnson13,nasr18,oh22,rimmer22}. 
Such strong attackers are not +within the scope of Tor~\cite{tor}. On the other side of the spectrum are local +attackers that see a small fraction of the network, typically in a position to +observe a user's encrypted entry traffic (Figure~\ref{fig:wf}). Many have +studied those \emph{weak attacks} in lab settings where, e.g., advances in deep +learning improved the accuracy significantly~\cite{wfdef,tiktok,df}. Others +have focused on improved attacks that are \emph{active} in the Tor network from +their own local vantage points~\cite{chakravarty10,mittal11,murdoch05}, which is +similar to the techniques in Papers~\ref{paper:cat}--\ref{paper:tlwo}. +Greschbach~\emph{et~al.}\ show that an attacker who gains access to (or traffic +to~\cite{siby20}) commonly used DNS resolvers like Google's \texttt{8.8.8.8} gets +valuable information to improve both end-to-end correlation and website +fingerprinting attacks~\cite{greschbach}. Paper~\ref{paper:cat} generalizes the +attacker capability they uncovered by allowing the attacker to query Tor's +receiver anonymity set with a website oracle of time-frame~$t$. It is further +shown that it is possible to instantiate such an abstraction in the real world +while \emph{staying within Tor's threat model}. In other words, the attacker is +still local but may employ passive and active measures to narrow down the +receiver anonymity set. Paper~\ref{paper:ctor} proposes Certificate +Transparency verification that gives log operators website oracle access. +Tor's directory authorities tune $t$. + +Website oracles exist because Tor is designed for anonymity---not unobservable +communication~\cite{anonterm}. The instantiation of a real-world website oracle +is either a direct result of observing network flows from the protocols +used during website visits, or due to state of these network flows being stored +and inferable. 
Inferring secret system state is widely studied in applied +cryptography and hardware +architecture~\cite{lucky13,ge18,kocher96,mart21,tsunoo03,heist}, where the goal +is usually to determine a key, decrypt a ciphertext, forge a message, or similar +using side-channels. A side-channel can be local or remote and ranges from +analysis of power consumption to cache states and timing differences. There is +a long history of remote timing attacks that are +practical~\cite{bbrumley11,dbrumley03,crosby09,wang22}. A recent improvement in +this area that is relevant for Tor is timeless timing attacks, which exploit +concurrency and message reordering to eliminate network jitter~\cite{timeless}. +Paper~\ref{paper:cat} demonstrates a remote timing attack against Tor's DNS +cache that achieves up to 17.3\% true positive rates while minimizing false +positives. Paper~\ref{paper:tlwo} instead uses a remote timeless timing attack +with no false positives, no false negatives, and a small time-frame $t$. This +approaches an ideal website oracle without special attacker capabilities or +reach into third-parties. + +\section{Conclusions and Future Work} \label{sec:concl} + +Throughout the thesis, we contributed to the understanding of how trust +requirements in Certificate Transparency can be reduced. Efficient and reliable +monitoring of the logs is easily overlooked. If the broader ecosystem achieves +monitoring through third-parties, they should be subject to the same scrutiny as +logs. We proposed a solution that makes it hard for third-party monitors to +provide subscribers with selective certificate notifications. We also proposed +a gossip-audit model that plugs into interacting with the logs over DNS by +having programmable packet processors verify that the same append-only logs are +observed. Avoiding the addition of complicated verification logic into end-user +software is likely a long-term win because it reduces the number of moving +parts. 
In other words, simple gossip-audit models will be much easier to deploy +in the wide variety of end-user software that embeds TLS clients. + +We also contributed to the understanding of how Certificate Transparency can be +applied in the context of Tor Browser. Compared to a regular browser, this +results in a different setting with its own challenges and opportunities. On +the one hand, Tor Browser benefits from the ability to preserve privacy due to +using the anonymity network Tor. On the other hand, data relating to +website visits cannot be persisted to disk (such as signed certificate +timestamps blocked by maximum merge delays). Our incrementally-deployable +proposal keeps the logic in Tor Browser simple by offloading all Certificate +Transparency verification to randomly selected Tor relays. The design is +complete because mis-issued certificates can eventually reach a trusted auditor +who acts on incidents. In addition to proposing Certificate Transparency in Tor +Browser, we also explored how certificates with onion addresses may improve the +association of domain names with onion addresses. Such certificates ensure +domain owners know which onion addresses can be discovered for their sites, much +like Certificate Transparency does the same thing for public TLS keys. This +also adds censorship resistance to the discovery as logs are append-only. + +As part of exploring Certificate Transparency in Tor Browser, we further +contributed to the understanding of how the protocols used during website visits +affect unlinkability between Tor users and their destination websites. For +example, fetching an inclusion proof from a Certificate Transparency log is one +such protocol. We extended the attacker model of website fingerprinting attacks +with website oracles that reveal whether any network user visited a website +during a specific time frame. 
Our results show that website oracles eliminate +most false positives for all but the most frequently visited websites. In +addition to the theoretical evaluation of the extended attacker model, we could +exploit (timeless) timing attacks in Tor's DNS cache to instantiate real-world +website oracles without any special capabilities or reach into third-parties. +This led us to contribute to the understanding of how Tor's DNS cache performs +today, including a proposal for a performant alternative that preloads the same +popular domains on all Tor relays to withstand all (timeless) timing attacks. + +As an outlook, our angle on Certificate Transparency verification has mostly +been \emph{reactive} for end-users. In other words, some or all certificate +verification occurs asynchronously after a website visit. An alternative to +this would be upfront delivery of inclusion proofs that reconstruct tree heads +which witnesses cosigned; a form of \emph{proactive} gossip as proposed by +Syta~\emph{et al.}~\cite{syta}. The significant upside is that the browser's +verification could become non-interactive, eliminating privacy concerns and +ensuring end-users only see certificates merged into the append-only logs. +Investigating what the blockers for such a proposal are in +practice---today---would be valuable as log verification quickly becomes +complicated with signed certificate timestamps and reactive gossip-audit models. +Are these blockers significant? Are they significant over time as other +\emph{eventual} changes will be needed, like post-quantum secure certificates? +New transparency log applications are unlikely to need the complexity of +Certificate Transparency, and should likely not copy something that was designed +to fit into an existing system with a large amount of legacy (such as +certificate authorities, their established processes for certificate issuance, +and the many client-server implementations already deployed on the Internet). 
+ +Orthogonal to the verification performed by end-users, contributing to the +understanding of how domains (fail to) use Certificate Transparency for +detecting mis-issued certificates is largely unexplored. For example, +subscribing to email notifications of newly issued certificates becomes less +useful in an era where certificates are renewed frequently and automatically. +Instead, domain owners need easy-to-use solutions that raise alarms only if +there is a problem. + +Finally, the mitigation deployed to counter our (timeless) timing attacks in +Tor's DNS cache is just that: a mitigation, not a defense, that applies to +modestly popular websites but not the long tail where the base rate is low. +This is because the attacker's needed website oracle time frame is so large that +a fuzzy time-to-live value does nothing. Practical aspects of a preloaded DNS +cache need to be explored further before deployment, such as the assumption of a +third-party that visits popular domains to assemble an allowlist. We may also +have \emph{underestimated} the utility of the existing Umbrella list, which in +and of itself does not require any new third-party. Does the use of Umbrella +impact page-load latency? Latency is the most crucial parameter to keep +minimized. The question is whether frequently looked-up domains are missed or +not by skipping the website-visit step, as for the non-extended Alexa and Tranco +lists. + +More broadly, the question of how to strike a balance between \emph{efficiency} +and \emph{effectiveness} of website fingerprinting defenses is open. How much +overhead in terms of added latency and/or bandwidth is needed? How much of that +overhead is sustainable, both from a user perspective (where, e.g., latency is +crucial for web browsing and other interactive activities) and a network health +perspective (such as the amount of volunteered relay bandwidth that is wasted)? 
It is +paramount to neither overestimate nor underestimate attacker capabilities, which +goes back to the still-debated threat model of website fingerprinting attacks. +Regardless of whether Tor's DNS cache becomes preloaded or not, it will be difficult +to circumvent DNS lookups from happening. Someone---be it a weak attacker like +ourselves or a recursive DNS resolver at an Internet service provider---is in a +position to narrow down the destination anonymity set. This is especially true +when also considering other protocols that reveal information about the +destination anonymity set during website visits. Accepting that sources of +real-world website oracles are prevalent implies that \emph{the world can be +closed}. Therefore, a closed world is more realistic than an open world. + +\subsection*{Acknowledgments} +I received valuable feedback while writing the introductory summary from + Simone Fischer-H\"{u}bner, + Johan Garcia, + Stefan Lindskog, and + Tobias Pulls. +The final draft was further improved with helpful nudges from Grammarly. 
+ +\bibliographystyle{plain} +\bibliography{src/introduction/refs} diff --git a/summary/src/introduction/refs.bib b/summary/src/introduction/refs.bib new file mode 100644 index 0000000..fc31dd8 --- /dev/null +++ b/summary/src/introduction/refs.bib @@ -0,0 +1,954 @@ +%%% +% Certificate transparency +%%% +@techreport{rfc6962, + author = {Ben Laurie and Adam Langley and Emilia Kasper}, + title = {{Certificate Transparency}}, + number = {6962}, + type = {RFC}, + institution = {IETF}, + year = {2013}, + url = {https://tools.ietf.org/html/rfc6962}, +} + +@techreport{rfc9162, + author = {Ben Laurie and Eran Messeri and Rob Stradling}, + title = {{Certificate Transparency} Version 2.0}, + number = {9162}, + type = {RFC}, + institution = {IETF}, + year = {2021}, + url = {https://tools.ietf.org/html/rfc9162}, +} + +@misc{google-log-policy, + author = {{Google LLC.}}, + title = {{Certificate Transparency} in {Chrome}}, + howpublished = {\url{https://googlechrome.github.io/CertificateTransparency/ct_policy.html}, accessed 2023-04-30}, +} + +@misc{apple-log-policy, + author = {{Apple Inc.}}, + title = {Apple's {Certificate Transparency} Policy}, + howpublished = {\url{https://support.apple.com/en-us/HT205280}, accessed 2023-04-30}, +} + +@misc{ct-monitors, + author = {{Google LLC.}}, + title = {The list of existing monitors}, + howpublished = {\url{https://certificate.transparency.dev/monitors/}, accessed 2023-04-30}, +} + +@misc{sslmate-history, + author = {{SSLMate Inc.}}, + title = {Timeline of Certificate Authority Failures}, + howpublished = {\url{https://sslmate.com/resources/certificate_authority_failures}, accessed 2023-04-30}, +} + +@misc{merkle-intro, + author = {Rasmus Dahlberg}, + title = {Transparency log preliminaries}, + howpublished = {\url{https://gitlab.torproject.org/rgdd/ct/-/blob/main/doc/tlog-preliminaries.md}, accessed 2023-04-30}, +} + +@article{ct, + author = {Ben Laurie}, + title = {{Certificate Transparency}}, + journal = {CACM}, + volume = {57}, + 
number = {10}, + year = {2014}, +} + +@article{ct-history, + author = {Emily Stark and + Joe DeBlasio and + Devon O'Brien and + Davide Balzarotti and + William Enck and + Samuel King and + Angelos Stavrou}, + title = {{Certificate Transparency} in {Google Chrome}: Past, Present, and Future}, + journal = {{IEEE} {S\&P}}, + volume = {19}, + number = {6}, + year = {2021}, +} + +@article{sok-sct-auditing, + author = {Sarah Meiklejohn and + Joe DeBlasio and + Devon O'Brien and + Chris Thompson and + Kevin Yeo and + Emily Stark}, + title = {{SoK}: {SCT} Auditing in {Certificate Transparency}}, + journal = {PETS}, + volume = {2022}, + number = {3}, +} + +@inproceedings{does-ct-break-the-web, + author = {Emily Stark and Ryan Sleevi and Rijad Muminovic and Devon O'Brien and Eran Messeri and Adrienne Porter Felt and Brendan McMillion and Parisa Tabriz}, + title = {Does {Certificate Transparency} Break the Web? {Measuring} Adoption and Error Rate}, + booktitle = {IEEE S\&P}, + year = {2019}, +} + +@inproceedings{ct-formal, + author = {Benjamin Dowling and + Felix G{\"{u}}nther and + Udyani Herath and + Douglas Stebila}, + title = {Secure Logging Schemes and {Certificate Transparency}}, + booktitle = {ESORICS}, + year = {2016}, +} + +@techreport{nordberg, + author = {Linus Nordberg and Daniel Kahn Gillmor and Tom Ritter}, + title = {Gossiping in {CT}}, + number = {draft-ietf-trans-gossip-05}, + type = {Internet-draft}, + institution = {IETF}, + year = {2018}, + url = {https://tools.ietf.org/html/draft-ietf-trans-gossip-05} +} + +@inproceedings{chuat, + author = {Laurent Chuat and Pawel Szalachowski and Adrian Perrig and Ben Laurie and Eran Messeri}, + title = {Efficient Gossip Protocols for Verifying the Consistency of Certificate Logs}, + booktitle = {CNS}, + year = {2015}, +} + +@inproceedings{gunn, + author = {Lachlan J. 
Gunn and Andrew Allison and Derek Abbott}, + title = {Safety in Numbers: Anonymization Makes Keyservers Trustworthy}, + booktitle = {HotPETs}, + year = {2017}, +} + +@article{hof, + author = {Benjamin Hof and Georg Carle}, + title = {Software Distribution Transparency and Auditability}, + journal = {CoRR}, + volume = {abs/1711.07278}, + year = {2017}, +} + +@inproceedings{syta, + author = {Ewa Syta and Iulia Tamas and Dylan Visher and David Isaac Wolinsky and Philipp Jovanovic and Linus Gasser and Nicolas Gailly and Ismail Khoffi and Bryan Ford}, + title = {Keeping Authorities "Honest or Bust" with Decentralized Witness Cosigning}, + booktitle = {IEEE S\&P}, + year = {2016}, +} + +@article{trustfabric-arxiv, + author = {Sarah Meiklejohn and + Pavel Kalinnikov and + Cindy S. Lin and + Martin Hutchinson and + Gary Belvin and + Mariana Raykova and + Al Cutter}, + title = {Think Global, Act Local: Gossip and Client Audits in Verifiable Data Structures}, + journal = {CoRR}, + volume = {abs/2011.04551}, + year = {2020}, +} + +@misc{sigsum-witness, + author = {Sigsum Project Contributors}, + title = {Witness {API} v0}, + howpublished = {\url{https://git.glasklar.is/sigsum/project/documentation/-/blob/main/witness.md}, accessed 2023-04-30}, +} + +@inproceedings{parakeet, + author = {Harjasleen Malvai and + Lefteris Kokoris{-}Kogias and + Alberto Sonnino and + Esha Ghosh and + Ercan Ozt{\"{u}}rk and + Kevin Lewi and + Sean F. 
Lawlor}, + title = {Parakeet: Practical Key Transparency for End-to-End Encrypted Messaging}, + booktitle = {{NDSS}}, + year = {2023}, +} + +@article{dirksen, + author = {Alexandra Dirksen and + David Klein and + Robert Michael and + Tilman Stehr and + Konrad Rieck and + Martin Johns}, + title = {{LogPicker}: Strengthening {Certificate Transparency} Against Covert Adversaries}, + journal = {PETS}, + volume = {2021}, + number = {4}, +} + +@misc{ct-over-dns, + author = {Ben Laurie}, + title = {{Certificate Transparency} over {DNS}}, + howpublished = {\url{https://github.com/google/certificate-transparency-rfcs/blob/master/dns/draft-ct-over-dns.md}, accessed 2023-04-30}, +} + +@inproceedings{lueks, + author = {Wouter Lueks and Ian Goldberg}, + title = {Sublinear Scaling for Multi-Client Private Information Retrieval}, + booktitle = {FC}, + year = {2015}, +} + +@inproceedings{kales, + author = {Daniel Kales and Olamide Omolola and Sebastian Ramacher}, + title = {Revisiting User Privacy for {Certificate Transparency}}, + booktitle = {IEEE EuroS\&P}, + year = {2019}, +} + +@inproceedings{henzinger, + author = {Alexandra Henzinger and Matthew M. 
Hong and Henry Corrigan-Gibbs and Sarah Meiklejohn and Vinod Vaikuntanathan}, + title = {One Server for the Price of Two: Simple and Fast Single-Server Private Information Retrieval}, + booktitle = {{USENIX Security}}, + year = {2023}, +} + +@inproceedings{chase, + author = {Melissa Chase and Sarah Meiklejohn}, + title = {Transparency Overlays and Applications}, + booktitle = {CCS}, + year = {2016}, +} + +@article{eskandarian, + author = {Saba Eskandarian and + Eran Messeri and + Joseph Bonneau and + Dan Boneh}, + title = {{Certificate Transparency} with Privacy}, + journal = {PETS}, + volume = {2017}, + number = {4}, +} + +@misc{opt-in-sct-auditing, + title = {Opt-in {SCT} Auditing}, + author = {Emily Stark and Chris Thompson}, + howpublished = {\url{https://docs.google.com/document/d/1G1Jy8LJgSqJ-B673GnTYIG4b7XRw2ZLtvvSlrqFcl4A/edit}, accessed 2023-04-30}, +} + +@misc{opt-out-sct-auditing, + title = {Opt-out {SCT} Auditing in {Chrome}}, + author = {Joe DeBlasio}, + howpublished = {\url{https://docs.google.com/document/d/16G-Q7iN3kB46GSW5b-sfH5MO3nKSYyEb77YsM7TMZGE/edit}, accessed 2023-04-30}, +} + +@misc{sth-push, + author = {Ryan Sleevi and Eran Messeri}, + title = {{Certificate Transparency} in {Chrome}: Monitoring {CT} Logs consistency}, + howpublished = {\url{https://docs.google.com/document/d/1FP5J5Sfsg0OR9P4YT0q1dM02iavhi8ix1mZlZe_z-ls/edit?pref=2&pli=1}, accessed 2023-04-30}, +} + +@misc{crt.sh, + author = {{Sectigo Limited}}, + title = {{crt.sh}: certificate search}, + howpublished = {\url{https://github.com/crtsh}, accessed 2023-04-30}, +} + +@misc{certspotter, + author = {{SSLMate Inc.}}, + title = {Cert Spotter---{Certificate Transparency} Monitor}, + howpublished = {\url{https://github.com/SSLMate/certspotter}, accessed 2023-04-30}, +} + +@misc{vds, + author = {Adam Eijdenberg and Ben Laurie and Al Cutter}, + title = {Verifiable Data Structures}, + howpublished = 
{\url{https://github.com/google/trillian/blob/master/docs/papers/VerifiableDataStructures.pdf}, accessed 2023-04-30}, +} + +@inproceedings{coniks, + author = {Marcela S. Melara and + Aaron Blankstein and + Joseph Bonneau and + Edward W. Felten and + Michael J. Freedman}, + title = {{CONIKS:} Bringing Key Transparency to End Users}, + booktitle = {{USENIX} Security}, + year = {2015}, +} + +@inproceedings{tomescu, + author = {Alin Tomescu and + Vivek Bhupatiraju and + Dimitrios Papadopoulos and + Charalampos Papamanthou and + Nikos Triandopoulos and + Srinivas Devadas}, + title = {Transparency Logs via Append-Only Authenticated Dictionaries}, + booktitle = {{CCS}}, + year = {2019}, +} + +@inproceedings{li, + author = {Bingyu Li and + Jingqiang Lin and + Fengjun Li and + Qiongxiao Wang and + Qi Li and + Jiwu Jing and + Congli Wang}, + title = {{Certificate Transparency} in the Wild: Exploring the Reliability of Monitors}, + booktitle = {{CCS}}, + year = {2019}, +} + +@misc{ayer-on-li, + author = {Andrew Ayer}, + title = {Reliability of Monitors | Mitigations}, + howpublished = {\url{https://groups.google.com/a/chromium.org/g/ct-policy/c/zCtQrn_7QK8}, accessed 2023-04-30}, +} + +@misc{cloudflare-scts, + author = {Nick Sullivan}, + title = {Understanding use-cases for {SCTs} delivered via {OCSP} stapling for {TLS} extension}, + howpublished = {\url{https://groups.google.com/a/chromium.org/g/ct-policy/c/WX6iZt7uJBs}, accessed 2023-04-30}, +} + +@misc{izenpe-err, + author = {Ryan Sleevi}, + title = {Upcoming {CT} Log Removal: {Izenpe}}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/qOorKuhL1vA}, accessed 2023-04-30}, +} + +@misc{venafi-err, + author = {Ryan Sleevi}, + title = {Upcoming Log Removal: {Venafi CT} Log Server}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/KMAcNT3asTQ}, accessed 2023-04-30}, +} + +@misc{trustasia-err, + author = {Andrew Ayer}, + title = {{Trust Asia} 2021 has 
produced inconsistent {STHs}}, + howpublished = {\url{https://groups.google.com/a/chromium.org/g/ct-policy/c/VJaSg717m9g}, accessed 2023-04-30}, +} + +@misc{google-err, + author = {Paul Hadfield}, + title = {Google {Aviator} incident under investigation}, + howpublished = {\url{https://groups.google.com/a/chromium.org/g/ct-policy/c/ZZf3iryLgCo/m/mi-4ViMiCAAJ}, accessed 2023-04-30}, +} + +@misc{starcom-err, + author = {Ryan Sleevi}, + title = {{StartCom} Log misbehaving: Failure to incorporate {SCTs}}, + howpublished = {\url{https://groups.google.com/a/chromium.org/g/ct-policy/c/92HIh2vG6GA/m/hBEHxcpoCgAJ}, accessed 2023-04-30}, +} + +@misc{wosign-err, + author = {Graham Edgecombe}, + title = {{WoSign} log failure to incorporate entry within the {MMD}}, + howpublished = {\url{https://groups.google.com/a/chromium.org/g/ct-policy/c/-eV4Xe8toVk/m/pC5gSjJKCwAJ}, accessed 2023-04-30}, +} + +@misc{digicert-err, + author = {Andrew Ayer}, + title = {Retiring {DigiCert} Log Server (aka {``CT1''}) in {Chrome}}, + howpublished = {\url{https://groups.google.com/a/chromium.org/g/ct-policy/c/P5aj4JEBFPM/m/9AEcvY01EQAJ}, accessed 2023-04-30}, +} + +@misc{digicert-kc, + title = {{CT2} Log Compromised via {Salt} Vulnerability}, + author = {Jeremy Rowley}, + howpublished = {\url{https://groups.google.com/a/chromium.org/forum/#!topic/ct-policy/aKNbZuJzwfM}, accessed 2023-04-30}, +} + +%%% +% Tor and traffic analysis +%%% +@misc{tpo, + author = {Tor Project}, + title = {Browse Privately. {Explore} Freely. {Defend} yourself against tracking and surveillance. {Circumvent} censorship.}, + howpublished = {\url{https://www.torproject.org/}, accessed 2022-04-30}, +} + +@inproceedings{tor, + author = {Roger Dingledine and Nick Mathewson and Paul F. 
Syverson}, + title = {Tor: The Second-Generation Onion Router}, + booktitle = {{USENIX Security}}, + year = {2004}, +} + +@misc{tb, + author = {Mike Perry and Erinn Clark and Steven Murdoch and Georg Koppen}, + title = {The Design and Implementation of the {Tor Browser [DRAFT]}}, + howpublished = {\url{https://2019.www.torproject.org/projects/torbrowser/design/}, accessed 2023-04-30}, +} + +@inproceedings{mani, + author = {Akshaya Mani and + T. Wilson{-}Brown and + Rob Jansen and + Aaron Johnson and + Micah Sherr}, + title = {Understanding {Tor} Usage with Privacy-Preserving Measurement}, + booktitle = {{IMC}}, + year = {2018} +} + +@inproceedings{johnson13, + author = {Aaron Johnson and Chris Wacek and Rob Jansen and Micah Sherr and Paul F. Syverson}, + title = {Users get routed: traffic correlation on {Tor} by realistic adversaries}, + booktitle = {{CCS}}, + year = {2013} +} + +@inproceedings{nasr18, + author = {Milad Nasr and Alireza Bahramali and Amir Houmansadr}, + title = {{DeepCorr}: Strong Flow Correlation Attacks on {Tor} Using Deep Learning}, + booktitle = {{CCS}}, + year = {2018} +} + +@article{rimmer22, + author = {Vera Rimmer and + Theodor Schnitzler and + Tom van Goethem and + Abel Rodr{\'{\i}}guez Romero and + Wouter Joosen and + Katharina Kohls}, + title = {Trace Oddity: Methodologies for Data-Driven Traffic Analysis on {Tor}}, + journal = {PETS}, + volume = {2022}, + number = {3}, +} + +@inproceedings{oh22, + author = {Se Eun Oh and + Taiji Yang and + Nate Mathews and + James K. 
Holland and + Mohammad Saidur Rahman and + Nicholas Hopper and + Matthew Wright}, + title = {{DeepCoFFEA}: Improved Flow Correlation Attacks on {Tor} via Metric Learning and Amplification}, + booktitle = {{IEEE} {S\&P}}, + year = {2022}, +} + +@article{cheng98, + title = {Traffic analysis of {SSL} encrypted web browsing}, + author = {Cheng, Heyning and Avnur, Ron}, + journal = {Project paper, University of Berkeley}, + year = {1998} +} + +@inproceedings{herrmann09, + author = {Dominik Herrmann and Rolf Wendolsky and Hannes Federrath}, + title = {Website fingerprinting: attacking popular privacy enhancing technologies with the multinomial na{\"{\i}}ve-bayes classifier}, + booktitle = {{CCSW}}, + year = {2009} +} + +@inproceedings{hintz02, + author = {Andrew Hintz}, + title = {Fingerprinting Websites Using Traffic Analysis}, + booktitle = {{PETS}}, + year = {2002} +} + +@inproceedings{liberatore06, + author = {Marc Liberatore and Brian Neil Levine}, + title = {Inferring the source of encrypted {HTTP} connections}, + booktitle = {{CCS}}, + year = {2006} +} + +@inproceedings{panchenko11, + author = {Andriy Panchenko and Lukas Niessen and Andreas Zinnen and Thomas Engel}, + title = {Website fingerprinting in onion routing based anonymization networks}, + booktitle = {{WPES}}, + year = {2011} +} + +@inproceedings{sun02, + author = {Qixiang Sun and Daniel R. Simon and Yi{-}Min Wang and Wilf Russell and Venkata N. 
Padmanabhan and Lili Qiu}, + title = {Statistical Identification of Encrypted Web Browsing Traffic}, + booktitle = {{IEEE S\&P}}, + year = {2002} +} + +@inproceedings{juarez14, + author = {Marc Ju{\'{a}}rez and Sadia Afroz and Gunes Acar and Claudia D{\'{\i}}az and Rachel Greenstadt}, + title = {A Critical Evaluation of Website Fingerprinting Attacks}, + booktitle = {{CCS}}, + year = {2014}, +} + +@misc{perryCrit, + author = {Mike Perry}, + title = {A Critique of Website Traffic Fingerprinting Attacks}, + howpublished = {\url{https://blog.torproject.org/critique-website-traffic-fingerprinting-attacks}, accessed 2023-04-30}, +} + +@article{realistic, + author = {Tao Wang and Ian Goldberg}, + title = {On Realistically Attacking {Tor} with Website Fingerprinting}, + journal = {PETS}, + volume = {2016}, + number = {4}, +} + +@inproceedings{onlinewf, + title={Online Website Fingerprinting: Evaluating Website Fingerprinting Attacks on {Tor} in the Real World}, + author={Cherubin, Giovanni and Jansen, Rob and Troncoso, Carmela}, + booktitle={{USENIX Security}}, + year={2022} +} + +@inproceedings{df, + author = {Payap Sirinam and + Mohsen Imani and + Marc Ju{\'{a}}rez and + Matthew Wright}, + title = {Deep Fingerprinting: Undermining Website Fingerprinting Defenses with + Deep Learning}, + booktitle = {{CCS}}, + year = {2018} +} + +@article{tiktok, + author = {Mohammad Saidur Rahman and + Payap Sirinam and + Nate Mathews and + Kantha Girish Gangadhara and + Matthew Wright}, + title = {{Tik-Tok}: The Utility of Packet Timing in Website Fingerprinting Attacks}, + journal = {{PETS}}, + volume = {2020}, + number = {3}, +} + +@inproceedings{wfdef, + title={{SoK}: A Critical Evaluation of Efficient Website Fingerprinting Defenses}, + author={Mathews, Nate and Holland, James K and Oh, Se Eun and Rahman, Mohammad Saidur and Hopper, Nicholas and Wright, Matthew}, + booktitle = {{IEEE} S{\&}P}, + year={2023} +} + +@inproceedings{spoiled-onions, + author = {Philipp Winter and Richard 
K{\"{o}}wer and Martin Mulazzani and Markus Huber and Sebastian Schrittwieser and Stefan Lindskog and Edgar R. Weippl}, + title = {Spoiled Onions: Exposing Malicious {Tor} Exit Relays}, + booktitle = {PETS}, + year = {2014}, +} + +@inproceedings{murdoch05, + author = {Steven J. Murdoch and George Danezis}, + title = {Low-Cost Traffic Analysis of {Tor}}, + booktitle = {{IEEE S\&P}}, + year = {2005}, +} + +@inproceedings{chakravarty10, + author = {Sambuddho Chakravarty and Angelos Stavrou and Angelos D. Keromytis}, + title = {Traffic Analysis against Low-Latency Anonymity Networks Using Available Bandwidth Estimation}, + booktitle = {{ESORICS}}, + year = {2010}, +} + +@inproceedings{mittal11, + author = {Prateek Mittal and + Ahmed Khurshid and + Joshua Juen and + Matthew Caesar and + Nikita Borisov}, + title = {Stealthy traffic analysis of low-latency anonymous communication using throughput fingerprinting}, + booktitle = {{CCS}}, + year = {2011}, +} + +@inproceedings{greschbach, + author = {Benjamin Greschbach and + Tobias Pulls and + Laura M. Roberts and + Phillip Winter and + Nick Feamster}, + title = {The Effect of {DNS} on {Tor}'s Anonymity}, + booktitle = {{NDSS}}, + year = {2017}, +} + +@inproceedings{siby20, + author = {Sandra Siby and Marc Ju{\'{a}}rez and Claudia D{\'{\i}}az and Narseo Vallina{-}Rodriguez and Carmela Troncoso}, + title = {Encrypted {DNS} -{\textgreater} Privacy? {A} Traffic Analysis Perspective}, + booktitle = {NDSS}, + year = {2020}, +} + +@misc{anonterm, + title={A terminology for talking about privacy by data minimization: Anonymity, unlinkability, undetectability, unobservability, pseudonymity, and identity management}, + author={Pfitzmann, Andreas and Hansen, Marit}, + publisher={Dresden, Germany}, + year={2010}, +} + +### +# Side-channels +### +@inproceedings{kocher96, + author = {Paul C. 
Kocher}, + title = {Timing Attacks on Implementations of {Diffie-Hellman}, {RSA}, {DSS}, and Other Systems}, + booktitle = {{CRYPTO}}, + year = {1996}, +} + +@inproceedings{dbrumley03, + author = {David Brumley and Dan Boneh}, + title = {Remote Timing Attacks Are Practical}, + booktitle = {{USENIX} Security}, + year = {2003}, +} + +@inproceedings{tsunoo03, + author = {Yukiyasu Tsunoo and + Teruo Saito and + Tomoyasu Suzaki and + Maki Shigeri and + Hiroshi Miyauchi}, + title = {Cryptanalysis of {DES} Implemented on Computers with Cache}, + booktitle = {{CHES}}, + year = {2003}, +} + +@article{crosby09, + author = {Scott A. Crosby and Dan S. Wallach and Rudolf H. Riedi}, + title = {Opportunities and Limits of Remote Timing Attacks}, + journal = {{ACM} Trans. Inf. Syst. Secur.}, + volume = {12}, + number = {3}, + year = {2009}, +} + +@inproceedings{bbrumley11, + author = {Billy Bob Brumley and Nicola Tuveri}, + title = {Remote Timing Attacks Are Still Practical}, + booktitle = {{ESORICS}}, + year = {2011}, +} + +@article{ge18, + author = {Qian Ge and + Yuval Yarom and + David A. Cock and + Gernot Heiser}, + title = {A survey of microarchitectural timing attacks and countermeasures on contemporary hardware}, + journal = {JCEN}, + volume = {8}, + number = {1}, + year = {2018}, +} + +@inproceedings{mart21, + author = {Macarena C. Mart{\'{\i}}nez{-}Rodr{\'{\i}}guez and + Ignacio M. Delgado{-}Lozano and + Billy Bob Brumley}, + title = {{SoK}: Remote Power Analysis}, + booktitle = {{ARES}}, + year = {2021}, +} + +@inproceedings{lucky13, + author = {Nadhem J. AlFardan and Kenneth G. 
Paterson}, + title = {Lucky Thirteen: Breaking the {TLS} and {DTLS} Record Protocols}, + booktitle = {{IEEE} {S\&P}}, + year = {2013}, +} + +@inproceedings{heist, + author = {Mathy Vanhoef and Tom Van Goethem}, + title = {{HEIST}: {HTTP} Encrypted Information can be +Stolen through {TCP}-windows}, + booktitle = {Black Hat US Briefings}, + year = {2016}, +} + +@inproceedings{timeless, + author = {Tom van Goethem and Christina P{\"{o}}pper and Wouter Joosen and Mathy Vanhoef}, + title = {Timeless Timing Attacks: Exploiting Concurrency to Leak Secrets over Remote Connections}, + booktitle = {{USENIX} Security}, + year = {2020}, +} + +@inproceedings{wang22, + author = {Yingchen Wang and + Riccardo Paccagnella and + Elizabeth Tang He and + Hovav Shacham and + Christopher W. Fletcher and + David Kohlbrenner}, + title = {Hertzbleed: Turning Power Side-Channel Attacks Into Remote Timing Attacks on x86}, + booktitle = {{USENIX} Security}, + year = {2022}, +} + +%%% +% Research methods +%%% +@inproceedings{sse, + author = {Cormac Herley and Paul C. 
van Oorschot}, + title = {{SoK}: Science, Security and the Elusive Goal of Security as a Scientific Pursuit}, + booktitle = {{IEEE} {S\&P}}, + year = {2017}, +} + +@inproceedings{smics, + author = {Dodig-Crnkovic, Gordana}, + title = {Scientific methods in computer science}, + booktitle = {Proceedings of the Conference for the Promotion of Research in IT at New Universities and at University Colleges in Sk\"{o}vde, Sweden}, + year = {2002}, +} + +@article{icss, + author = {Denning, Peter J}, + title = {Is computer science science?}, + journal = {CACM}, + volume = {48}, + number = {4}, + year = {2005}, +} + +@article{rfenr, + author = {Vaibhav Bajpai and + Anna Brunstr{\"{o}}m and + Anja Feldmann and + Wolfgang Kellerer and + Aiko Pras and + Henning Schulzrinne and + Georgios Smaragdakis and + Matthias W{\"{a}}hlisch and + Klaus Wehrle}, + title = {The Dagstuhl beginners guide to reproducibility for experimental networking research}, + journal = {CCR}, + volume = {49}, + number = {1}, + year = {2019}, +} + +% "There are several reasons why definitions are important [...]" +% "[...] focusing their efforts on devising attacks that are outside the model" +@article{secdefs, + author = {Neal Koblitz and Alfred Menezes}, + title = {Another look at security definitions}, + journal = {AMC}, + volume = {7}, + number = {1}, + year = {2013}, +} + +% §1.1 gives the background of the first reduction proofs / provable security +@article{provsec, + author = {Neal Koblitz and + Alfred Menezes}, + title = {Another Look at ``Provable Security''}, + journal = {J. 
Cryptol.}, + volume = {20}, + number = {1}, + year = {2007}, +} + + +%%% +% Naming of onion services +%%% +@misc{onion-location, + author = {Tor Project}, + title = {{Onion-Location}}, + howpublished = {\url{https://community.torproject.org/onion-services/advanced/onion-location/}, accessed 2023-04-30}, +} + +@misc{kadianakis, + author = {George Kadianakis and Yawning Angel and David Goulet}, + title = {A Name System {API} for {Tor} Onion Services}, + howpublished = {\url{https://gitlab.torproject.org/tpo/core/torspec/-/blob/main/proposals/279-naming-layer-api.txt}, accessed 2023-04-30}, +} + +@misc{muffet-onions, + author = {Alec Muffett}, + title = {Real-World Onion Sites}, + howpublished = {\url{https://github.com/alecmuffett/real-world-onion-sites}, accessed 2023-04-30}, +} + +@phdthesis{nurmi, + author = {Nurmi, Juha}, + title = {Understanding the Usage of Anonymous Onion Services}, + year = {2019}, + school = {Tampere University, Finland}, +} + +@Misc{h-e-securedrop, + author = {SecureDrop}, + title = {Getting an Onion Name for Your {SecureDrop}}, + howpublished = {\url{https://securedrop.org/faq/getting-onion-name-your-securedrop/}, accessed 2023-04-30}, +} + +@article{onio-ns, + author = {Jesse Victors and Ming Li and Xinwen Fu}, + title = {The Onion Name System}, + journal = {PETS}, + volume = {2017}, + number = {1}, +} + +%%% +% Other +%%% +@inproceedings{le, + author = {Josh Aas and + Richard Barnes and + Benton Case and + Zakir Durumeric and + Peter Eckersley and + Alan Flores{-}L{\'{o}}pez and + J. Alex Halderman and + Jacob Hoffman{-}Andrews and + James Kasten and + Eric Rescorla and + Seth D. Schoen and + Brad Warren}, + title = {{Let's Encrypt}: An Automated Certificate Authority to Encrypt the Entire Web}, + booktitle = {{CCS}}, + year = {2019}, +} + +@inproceedings{sok-https, + author = {Jeremy Clark and Paul C. 
van Oorschot}, + title = {{SoK}: {SSL} and {HTTPS:} Revisiting Past Challenges and Evaluating Certificate Trust Model Enhancements}, + booktitle = {{IEEE} {S\&P}}, + year = {2013}, +} + +@inproceedings{browser-ui, + author = {Emanuel von Zezschwitz and Serena Chen and Emily Stark}, + title = {``{It} builds trust with the customers''---Exploring User Perceptions of the Padlock Icon in Browser {UI}}, + booktitle = {{IEEE} SPW}, + year = {2022}, +} + +@article{tls-timeline, + author = {Ralph Holz and + Jens Hiller and + Johanna Amann and + Abbas Razaghpanah and + Thomas Jost and + Narseo Vallina{-}Rodriguez and + Oliver Hohlfeld}, + title = {Tracking the deployment of {TLS} 1.3 on the web: a story of experimentation and centralization}, + journal = {CCR}, + volume = {50}, + number = {3}, + year = {2020}, +} + +@misc{mls, + author = {Nick Sullivan and Sean Turner}, + title = {Messaging Layer Security: Secure and Usable End-to-End Encryption}, + howpublished = {\url{https://www.ietf.org/blog/mls-secure-and-usable-end-to-end-encryption/}, accessed 2023-04-30}, +} + +@inproceedings{wireguard, + author = {Jason A. 
Donenfeld}, + title = {WireGuard: Next Generation Kernel Network Tunnel}, + booktitle = {{NDSS}}, + year = {2017}, +} + +@techreport{rfc8484, + author = {Paul Hoffman and Patrick McManus}, + title = {{DNS} Queries over {HTTPS} ({DoH})}, + number = {8484}, + type = {RFC}, + institution = {IETF}, + year = {2018}, + howpublished = {https://tools.ietf.org/html/rfc8484}, +} + +@misc{zerodium, + author = {{Zerodium}}, + title = {We pay big bounties}, + howpublished = {\url{https://zerodium.com/}, accessed 2023-04-30}, +} + +@misc{ca/b, + author = {{CA/Browser Forum}}, + title = {Baseline Requirements for the Issuance and Management of Publicly‐Trusted Certificates}, + howpublished = {\url{https://cabforum.org/wp-content/uploads/CA-Browser-Forum-BR-1.8.7.pdf}, accessed 2023-04-30}, +} + +@misc{crt:www.example.com, + author = {{Sectigo Limited}}, + title = {crt.sh: certificate search {ID = '8913351873'}}, + howpublished = {\url{https://crt.sh/?id=8913351873}, accessed 2023-04-30}, +} + +@inproceedings{merkle, + author = {Ralph C. Merkle}, + title = {A Digital Signature Based on a Conventional Encryption Function}, + booktitle = {{CRYPTO}}, + year = {1987}, +} + +@inproceedings{history-trees, + author = {Scott A. Crosby and Dan S. 
Wallach}, + title = {Efficient Data Structures For Tamper-Evident Logging}, + booktitle = {{USENIX} Security}, + year = {2009}, +} + +@techreport{black-tulip, + author = {Hans Hoogstraaten}, + title = {Black Tulip---Report of the investigation into the {DigiNotar} Certificate Authority breach}, + institution = {Fox-IT}, + year = {2012}, +} + +@inproceedings{bambo-cas, + author = {Henry Birge{-}Lee and + Yixin Sun and + Anne Edmundson and + Jennifer Rexford and + Prateek Mittal}, + title = {Bamboozling Certificate Authorities with {BGP}}, + booktitle = {{USENIX Security}}, + year = {2018}, +} + +@article{rtb, + author = {Jun Wang and + Weinan Zhang and + Shuai Yuan}, + title = {Display Advertising with Real-Time Bidding {(RTB)} and Behavioural + Targeting}, + journal = {Foundations and Trends in Information Retrieval}, + year = {2017} +} + +@techreport{ocsp, + author = {Santesson, Stefan and Myers, Michael and Ankney, Rich and Malpani, Ambarish and Galperin, Slava and Adams, Carlisle}, + title = {X.509 {Internet} Public Key Infrastructure Online Certificate Status Protocol---{OCSP}}, + number = {6960}, + type = {RFC}, + institution = {IETF}, + year = {2013}, + url = {https://tools.ietf.org/html/rfc6960}, +} + +@misc{trsb, + author = {Tor Project}, + title = {Research Safety Board}, + howpublished = {\url{https://research.torproject.org/safetyboard/}, accessed 2023-04-30}, +} diff --git a/summary/src/lwm/.gitignore b/summary/src/lwm/.gitignore new file mode 100644 index 0000000..8bb88c8 --- /dev/null +++ b/summary/src/lwm/.gitignore @@ -0,0 +1,9 @@ +main.pdf +*.blg +*.bbl +*.fls +*.fdb_latexmk +*.log +*.out +*.aux +*.swp diff --git a/summary/src/lwm/img/mt.tex b/summary/src/lwm/img/mt.tex new file mode 100644 index 0000000..a62b333 --- /dev/null +++ b/summary/src/lwm/img/mt.tex @@ -0,0 +1,28 @@ +\begin{tikzpicture}[ + sibling distance=32pt, + -latex, + apnode/.style = { + draw=black, + dashed, + }, + ap/.style = { + draw=black, + dashed, + }, +] + \Tree [ + 
.$r\gets\hash(h_{ab}\concat h_{cd})$ [ + .\node[apnode]{$h_{ab}\gets\hash(h_a\concat h_b)$}; [ + .$h_a\gets\hash(a)$ + ] [ + .$h_b\gets\hash(b)$ + ] + ] \edge[ap]; [ + .$h_{cd}\gets\hash(h_c\concat h_d)$ [ + .\node[apnode]{$h_c\gets\hash(c)$}; + ] \edge[ap]; [ + .$h_d\gets\hash(d)$ + ] + ] + ] +\end{tikzpicture} diff --git a/summary/src/lwm/img/overview.tex b/summary/src/lwm/img/overview.tex new file mode 100644 index 0000000..9f3a9d0 --- /dev/null +++ b/summary/src/lwm/img/overview.tex @@ -0,0 +1,75 @@ +\scalebox{0.9}{\begin{tikzpicture}[ + -latex, + rrs/.style = { + draw = gray!30, + thick, + rounded rectangle, + fill = white, + minimum width = 2cm, + minimum height = 0.7cm, + font = \fontsize{10}{10}\selectfont, + text = white, + }, + ls/.style = { + font=\fontsize{9}{8}\selectfont, + }, +] +\draw (0, 1) node[rrs, fill=rgddTeal] (Log) {Log}; +\draw (0, -1) node[rrs, fill=rgddLime] (Subject) {Subject}; +\draw (3.5, 0) node[rrs, fill=rgddPurple] (Endpoint) {Notifier}; +\draw (-3.5, 0) node[rrs, fill=rgddRed] (Monitor) {Monitor}; + + +\path [draw, ->, rounded corners] + (Log.north) |- + ($ (Log.north) + (Log.west) - (Log) + (-0.25, 0.25) $) + node[ls, above, pos=0.75]{ + STH with snapshot extension + } |- + (Log.west); + +\path [draw, ->, rounded corners] + (Monitor.south) |- + ($ (Monitor.south) + (Monitor.west) - (Monitor) + (-0.25, -0.25) $) + node[ls, below, pos=0.75]{ + verify STH extension + } |- + (Monitor.west); + +\path [draw, ->, rounded corners] + (Subject.south) |- + ($ (Subject.south) + (Subject.east) - (Subject) + (0.25, -0.25) $) + node[ls, below, pos=0.75]{ + verify notification + } |- + (Subject.east); + +\path [draw, <-, dashed, rounded corners] + (Endpoint.north) |- + ($ (Endpoint.east) + (Endpoint.north) - (Endpoint) + (0.25, 0.25) $) + node[ls, above, pos=0.75]{ + optional verify + } |- + (Endpoint.east); + +\draw [->] + (Log.south east) -- + node[ls, sloped, anchor=center, above]{% + batch, STH + } + (Endpoint.north west); + +\draw [->] + 
(Endpoint.south west) -- + node[ls, sloped, anchor=center, above]{% + notification + } + (Subject.north east); + +\path [draw, ->] + (Log.south west) -- + node[ls, sloped, pos=.59, above]{% + batch, STH + } + (Monitor.north east); +\end{tikzpicture}} diff --git a/summary/src/lwm/img/proofcom.pdf b/summary/src/lwm/img/proofcom.pdf new file mode 100644 index 0000000..473d817 Binary files /dev/null and b/summary/src/lwm/img/proofcom.pdf differ diff --git a/summary/src/lwm/img/proofgen.pdf b/summary/src/lwm/img/proofgen.pdf new file mode 100644 index 0000000..deb7ca4 Binary files /dev/null and b/summary/src/lwm/img/proofgen.pdf differ diff --git a/summary/src/lwm/img/proofvf.pdf b/summary/src/lwm/img/proofvf.pdf new file mode 100644 index 0000000..a2db9d1 Binary files /dev/null and b/summary/src/lwm/img/proofvf.pdf differ diff --git a/summary/src/lwm/img/snapshot.pdf b/summary/src/lwm/img/snapshot.pdf new file mode 100644 index 0000000..df185f6 Binary files /dev/null and b/summary/src/lwm/img/snapshot.pdf differ diff --git a/summary/src/lwm/img/wildcard.tex b/summary/src/lwm/img/wildcard.tex new file mode 100644 index 0000000..73f4262 --- /dev/null +++ b/summary/src/lwm/img/wildcard.tex @@ -0,0 +1,22 @@ +\begin{tikzpicture}[ + sibling distance=6pt, + level distance=100pt, + -latex, + grow=left, +] + \Tree [ + .$r\gets\hash(h_{01}\concat h_{23})$ [ + .$h_{01}\gets\hash(h_0\concat h_1)$ [ + .$h_0\gets\hash(\mathsf{gro.elpmaxe})$ + ] [ + .$h_1\gets\hash(\mathsf{moc.elpmaxe})$ + ] + ] [ + .$h_{23}\gets\hash(h_2\concat h_3)$ [ + .$h_2\gets\hash(\mathsf{moc.elpmaxe.bus})$ + ] [ + .$h_3\gets\hash(\mathsf{ten.elpmaxe})$ + ] + ] + ] +\end{tikzpicture} diff --git a/summary/src/lwm/main.tex b/summary/src/lwm/main.tex new file mode 100644 index 0000000..e6951b4 --- /dev/null +++ b/summary/src/lwm/main.tex @@ -0,0 +1,54 @@ +\begin{kaupaper}[ + author={% + \textbf{Rasmus Dahlberg} and Tobias Pulls + }, + title={% + Verifiable Light-Weight Monitoring for Certificate Transparency Logs 
+ }, + reference={% + NordSec (2018) + }, + summary={% + An often overlooked part of Certificate Transparency is that domain owners + are expected to inspect the logs for mis-issued certificates continuously. + The cost and required expertise to do so have led to the emergence of + third-party monitoring services that notify domain owners of newly issued + certificates that they subscribe to. For example, one may subscribe to + email notifications whenever a certificate is issued for + \texttt{*.example.com}. One downside of such third-party monitoring is + that these notification services become trusted parties with little or no + accountability with regard to omitted certificate notifications. We show + how to add this accountability and tie it to the gossip-audit model + employed by the Certificate Transparency ecosystem by proposing + verifiable light-weight monitoring. The idea is for logs to batch + appended certificates into an additional data structure that + supports \emph{wild-card (non-)membership proofs}. As a result, + third-party monitors can prove cryptographically that they did not omit + any certificate notifications selectively. Our experimental performance + evaluation shows that overhead can be tuned to be small for all involved + parts. + }, + participation={\vspace{-0.75cm} + I had the initial idea and conducted most of the work myself. Tobias + mainly contributed with discussions that led to the final design. 
+  },
+  label={
+    paper:lwm
+  },
+]
+  \maketitle
+  \begin{abstract}
+    \input{src/lwm/src/abstract}
+  \end{abstract}
+
+  \input{src/lwm/src/introduction}
+  \input{src/lwm/src/background}
+  \input{src/lwm/src/lwm}
+  \input{src/lwm/src/evaluation}
+  \input{src/lwm/src/conclusion}
+
+  \input{src/lwm/src/acknowledgments}
+
+  \bibliographystyle{plain}
+  \bibliography{src/lwm/src/references}
+\end{kaupaper}
diff --git a/summary/src/lwm/src/abstract.tex b/summary/src/lwm/src/abstract.tex
new file mode 100644
index 0000000..f5a68b7
--- /dev/null
+++ b/summary/src/lwm/src/abstract.tex
@@ -0,0 +1,21 @@
+\noindent
+Trust in publicly verifiable Certificate Transparency (CT) logs is reduced
+through
+  cryptography,
+  gossip,
+  auditing, and
+  monitoring.
+The role of a monitor is to observe each and every log entry, looking for
+suspicious certificates that interest the entity running the monitor.
+While anyone can run a monitor, it requires
+  continuous operation and
+  copies of the logs to be inspected.
+This has led to the emergence of monitoring as-a-service:
+  a trusted third-party runs the monitor and provides registered subjects with
+  selective certificate notifications.
+We present a CT/bis extension for verifiable \emph{light-weight monitoring} that
+enables subjects to verify the correctness of such certificate notifications,
+making it easier to distribute and reduce the trust which is otherwise placed in
+these monitors. Our extension
+supports verifiable monitoring of wild-card domains and piggybacks on CT's
+existing gossip-audit security model.
diff --git a/summary/src/lwm/src/acknowledgments.tex b/summary/src/lwm/src/acknowledgments.tex
new file mode 100644
index 0000000..b3e7d56
--- /dev/null
+++ b/summary/src/lwm/src/acknowledgments.tex
@@ -0,0 +1,3 @@
+\section*{Acknowledgments}
+We would like to thank Linus Nordberg for valuable feedback. This research was
+funded by the Swedish Knowledge Foundation as part of the HITS research profile.
diff --git a/summary/src/lwm/src/background.tex b/summary/src/lwm/src/background.tex new file mode 100644 index 0000000..bac403b --- /dev/null +++ b/summary/src/lwm/src/background.tex @@ -0,0 +1,119 @@ +\section{Background} \label{lwm:sec:background} +Suppose that a trusted content provider would like to outsource its operation to +an untrusted third-party. This is often referred to as the three-party setting, +in which a trusted source maintains an authenticated data structure through a +responder that answers client queries on the source's behalf~\cite{ads}. +The data structure is authenticated in the sense that every answer is +accompanied by a cryptographic proof that can be verified for correctness by +only trusting the source. +While there are many settings and flavors of authenticated data +structures~\cite{history-tree,pad,accumulator}, our scope is narrowed down to CT +which builds upon Merkle trees. + +\subsection{Merkle Trees} \label{lwm:sec:background:mt} +The seminal work by Merkle~\cite{mt} proposed a \emph{static} binary tree where +each leaf stores the hash of a value and every interior node hashes its children + (Figure~\ref{lwm:fig:mt}). +The root hash serves as a succinct snapshot of the tree's structure and content, +and by revealing a logarithmic number of hashes it can be reconstructed to prove +whether a value is stored in a leaf. These hashes compose an audit path for +a value, and it is obtained by taking every sibling hash while traversing the +tree from the root down towards the leaf being authenticated. An audit path is +verified by reversing the traversal used during generation, first reconstructing +the leaf hash and then every interior node recursively + (using the provided sibling hashes) +until finally reaching the root. Given a collision resistant hash function, +an audit path proves that a given leaf contains a value iff the reconstructed +root hash is known to be authentic. For example, the trusted source might sign +it. 
+\begin{figure} + \centering + \input{src/lwm/img/mt.tex} + \caption{% + Merkle tree containing four values $a$--$d$. The dashed arrows show the + traversal used to generate an audit path for the right-most leaf (dashed + nodes). + } + \label{lwm:fig:mt} +\end{figure} + +While non-membership of a value can be proven by providing the entire data +structure, this is generally too inefficient since it requires linear space and +time. A better approach is to structure the tree such that the node which should +contain a value is known if it exists. This property is often discussed in +relation to certificate revocation: + as opposed to downloading a list of serial numbers that represent the set of + revoked certificates, + each leaf in a static Merkle tree could (for example) contain an interval + $[a, b)$ where $a$ is revoked and the open interval $(a,b)$ + current~\cite{crt}. +Given a serial number $x$, an audit path can be generated in logarithmic space +and time for the leaf where $x \in [a,b)$ to prove (non-)membership. Similar +constructions that are \emph{dynamic} support updates more +efficiently~\cite{pad,vds,coniks}. + +\subsection{Certificate Transparency} \label{lwm:sec:bac:ct} +The CA ecosystem involves hundreds of trusted third-parties that issue TLS +certificates~\cite{ca-ecosystem}. Once in a while \emph{somebody} gets this process +wrong, and as a result a fraudulent identity-to-key binding may be issued for +\emph{any} subject~\cite{enisa}. +It is important to detect such incidents because mis-issued certificates can +be used to intercept TLS connections. However, detection is hard unless the +subjects \emph{who can distinguish between anything benign and fraudulent} +get a concise view of the certificates that are being served to the clients. 
+By requiring that every CA-issued certificate must be disclosed in a public +and append-only log, CT layers on-top of the error-prone CA ecosystem to provide +such a view: + in theory anyone can inspect a log and determine for herself if a certificate + is mis-issued~\cite{ct}. + +It would be counter-intuitive to `solve' blind trust in CAs by suggesting that +everybody should trust a log. Therefore, CT is designed such that the log can +be distrusted based on two components: + a dynamic append-only Merkle tree that supports verifiable membership and + consistency queries~\cite{history-tree}, as well as + a gossip protocol that detects split-views~\cite{sthnp,ietf-gossip}. +We already introduced the principles of membership proofs in +Section~\ref{lwm:sec:background:mt}, and consistency proofs are similar in that +a logarithmic number of hashes are revealed to prove two snapshots consistent. +In other words, anyone can verify that a certificate is included in the log +without fully downloading it, and whatever was in the log before still remains +unmodified. Unlike the three-party setting, gossip is needed because there is no +trusted source that signs-off the authenticated data structure: + consistency and inclusion proofs have limited value if everybody observes + different (but valid) versions of the log. + +\subsubsection*{Terminology, Policy Parameters, and Status Quo} +A new STH---recall that this is short for Signed Tree Head---is issued by the +log at least every Maximum Merge Delay (MMD) and no faster than allowed by an +STH frequency~\cite{ct/bis}. An MMD is the +longest time until a certificate must be included in the log after promising to +include it. This promise is referred to as a Signed Certificate Timestamp (SCT). +An STH frequency is relative to the MMD, and limits the number of STHs that can +be issued. 
These parameters (among others) are defined in a log's policy, and if
+a violation is detected there are non-repudiable proofs of log misbehavior that
+can be presented. For example, show
+  an SCT that is not included after an MMD,
+  too many STHs during the period of an MMD, or
+  two STHs that are part of two inconsistent versions of the log.
+In other words, rather than being a trusted source a log signs statements to be
+held accountable.
+
+Ideally we would have all of these components in place at once: anyone that
+interacts with a log audits it for correctness based on partial information
+  (SCTs, STHs, served certificates, and proofs),
+subjects monitor the logs for newly included certificates to check that they are
+free from mis-issuance (full download), and a gossip protocol detects or deters
+logs from presenting split-views. This is not the case in practice, mainly
+because CT is being deployed incrementally~\cite{sth-push}
+but also because the cost and complexity of self-monitoring is relatively high.
+For example,
+a subject that wants rapid detection of mis-issuance needs continuous operation
+and full downloads of the logs. It appears that the barrier towards self-%
+monitoring has led to the emergence of monitoring as-a-service, where a
+trusted third-party monitors the logs on a subject's behalf by selectively
+notifying her of relevant certificates, e.g., mail the operator of
+$\mathsf{example.com}$ if $\mathsf{*.example.com}$ certificates are ever found.
+Third-party monitoring is convenient for logs too because it reduces the
+bandwidth required to serve many subjects. However, for CT it is an unintuitive
+concept given that it requires blind trust.
diff --git a/summary/src/lwm/src/conclusion.tex b/summary/src/lwm/src/conclusion.tex new file mode 100644 index 0000000..e071935 --- /dev/null +++ b/summary/src/lwm/src/conclusion.tex @@ -0,0 +1,15 @@ +\section{Conclusion} \label{lwm:sec:conclusion} +We proposed a backwards-compatible CT/bis extension that enables light-weight +monitoring (in short LWM). At the cost of a few hundred Kb per day, a subject +can either self-monitor or subscribe to verifiable certificate notifications for +a dozen of logs via an untrusted notifier. The security of LWM piggybacks on the +gossip-audit model of CT, and it relies only on the existence of at least one +honest monitor that verifies our extension. The cost of a compliant log is +overhead during the tree head construction, and this overhead is insignificant +in comparison to a log's STH frequency. A notifier can generate verifiable +certificate notifications---even for wild-card queries for all domains under a +top-level domain---in the order of milliseconds on a single core. Given an +STH frequency of one hour and 288~M LWM subjects, the incurred bandwidth +overhead is roughly 640~Mbps for proofs. As such, a log could easily be its +own notifier on a 1~Gbps connection. Further, any willing third-party could +notify for a dozen of logs on a 10~Gbps connection. diff --git a/summary/src/lwm/src/evaluation.tex b/summary/src/lwm/src/evaluation.tex new file mode 100644 index 0000000..d9d508d --- /dev/null +++ b/summary/src/lwm/src/evaluation.tex @@ -0,0 +1,186 @@ +\section{Evaluation} \label{lwm:sec:evaluation} +First we discuss assumptions and sketch on relevant security properties for LWM. +Next, we examine performance properties of our open-source proof-of-concept +implementation experimentally and reason about bandwidth overhead in theory. +Finally, we present differences and similarities between LWM and related work. 
+ +\subsection{Assumptions and Security Notions} \label{lwm:sec:eval:security} +The primary threat is a computationally bound attacker that attempts to forge or omit a +certificate notification without being detected. +We rely on standard cryptographic assumptions, namely an unforgeable digital +signature scheme and a collision resistant hash function $\hash$ with +$2\secpar$-bit output for a security parameter~$\secpar$. +The former means that an LWM snapshot must originate from the (untrusted) log in +question. While an incorrect snapshot could be created intentionally to hide a +mis-issued certificate, it would be detected if at least one honest monitor +exists because our STH extension piggybacks on the gossip-audit model of CT +(that we assume is secure).\footnote{% + Suppose that witness cosigning is used~\cite{cosi}. Then we rely on at least + one witness to verify our extension. Or, suppose that STH pollination is + used~\cite{ietf-gossip}. Then we rely on the most recent window of STHs to + reach a monitor that verifies our extension. +} +A subject can further detect missing notifications by checking the STH index for +monotonic increases and the STH timestamp for freshness. Thus, given secure +audit paths and correct verification checks as described in +Section~\ref{lwm:sec:lwm:wildcard}, no certificate notification can be forged or +omitted. Our cryptographic assumptions ensure that every leaf is fixed by a +secure audit path as in CT, i.e., a leaf hash with value $v$ is encoded as +$\hash(0x00 \concat v$) and an interior hash with children $L,R$ as +$\hash(0x01 \concat L \concat R)$~\cite{history-tree,ct}. To exclude any +unnecessary data on the ends of a range, the value $v$ is a subject name +concatenated with a hashed list of associated certificates in LWM (subject +names suffice to verify $\Omega$~order). + +CT makes no attempt to offer security in the multi-instance setting~\cite{katz}. 
+Here, an attacker that targets many different Merkle trees in parallel should
+gain no advantage while trying to forge \emph{any} valid (non-)membership
+proof. By design there will be many different wild-card Merkle trees in LWM, and
+so the (strictly stronger) multi-instance setting is reasonable. We can
+provide full bit-security in this setting by ensuring that no node's pre-image
+is valid across different trees by incorporating a unique tree-wide
+constant $c_t$ in leaf and empty hashes \emph{per batch}, e.g.,
+$c_t \sample \set{0,1}^\secpar$. Melara \emph{et~al.}~\cite{coniks}
+describe this in detail while also ensuring that no node's pre-image is valid
+across different locations within a Merkle tree.
+
+In an ecosystem where CT is being deployed incrementally without gossip, the
+benefit of LWM is that a subject who subscribes for certificate notifications
+can trust the log only (as opposed to \emph{also} trusting the notifier).
+Therefore, today's trust in third-party monitoring services can be reduced
+significantly. A log must also present a split-view or an invalid snapshot to
+deceive a subject with false notifications. As such, subjects accumulate binding
+evidence of log misbehavior that can be audited sometime in the future if
+suspicion towards a log is raised. Long-term the benefit of LWM is that it is
+easier to distribute the trust which is placed in third-party monitors, i.e.,
+anyone who processes a (small in comparison to the entire log) batch of
+certificates can full-audit it without being a notifier.
+
+\subsection{Implementation and Performance} \label{lwm:sec:eval:perf}
+We implemented multi-instance secure LWM in less than 400 lines of
+Go~\cite{artifact}.
+Our wild-card structure uses an existing implementation of a radix tree to find
+leaf indices and data. To minimize proof-generation times, all hashes are
+cached in an in-memory Merkle tree which uses SHA-256.
We benchmarked snapshot
+creation, proof generation, and proof verification times on a single core as the
+batch size increases from 1024--689,245~certificates using
+  Go's built-in benchmarking tool,
+  an Intel(R) Core(TM) i5-2500 CPU @ 3.30GHz, and
+  2x8 Gb DDR3 RAM.
+We assumed real subject names from Alexa's top-1M~\cite{alexa} and
+average-sized certificates of 1500~bytes~\cite{cert-size}, where a batch of $n$
+subject names refers to the $n$ most popular domains. Notably 689,245
+certificates is the largest batch observed by us in Google's Icarus log between
+2017-01-25 and 2018-08-05, corresponding to an STH interarrival time of
+27.1~hours. The median (average) batch size and STH interarrival time were 22818
+(23751) certificates and 60.1 (61.6) minutes. Only two batches were larger than
+132077 certificates. Considering that Icarus is one of the logs that see largest
+loads~\cite{log-growth}, we can make non-optimistic conclusions regarding the
+performance overhead of LWM without inspecting other logs.
+
+Figure~\ref{lwm:fig:snapshot} shows snapshot creation time as a function of batch
+size. Nearby the median ($2^{15}$) it takes 0.39~seconds to create a
+snapshot from scratch, initializing state from an unordered dictionary and
+caching all hashes for the first time. For the largest batch, the snapshot
+creation time is roughly 10 seconds. Arguably this overhead is still
+insignificant for logs, monitors, and notifiers because the associated STH
+interarrival times are orders of magnitude larger.
+\begin{figure}[!t]
+  \centering
+  \includegraphics[width=.9\columnwidth]{src/lwm/img/snapshot.pdf}
+  \caption{%
+    Snapshot creation time as a function of batch size.
+  }
+  \label{lwm:fig:snapshot}
+\end{figure}
+
+\begin{figure}[!t]
+  \centering
+  \includegraphics[width=.9\columnwidth]{src/lwm/img/proofgen.pdf}
+  \caption{%
+    Membership and non-membership proof query time as a function of batch
+    size for a single and no match, respectively.
+ } + \label{lwm:fig:proofgen} +\end{figure} + +Figure~\ref{lwm:fig:proofgen} shows proof generation time as a function of batch size +while querying for the longest wild-card prefix with a single match +(membership), as well as another wild-card prefix without any match in com's +top-level domain (non-membership). +There is little or no difference between the generation time for these types +of wild-card proofs, and nearby the median it takes +around 7~$\mu s$. For the largest batch, this increased to $12.5$~$\mu s$. +A notifier can thus generate 288 million non-membership +notifications per hour \emph{on a single core}. Verification is also in the +order of $\mu s$, which should be negligible for a subject +(see Figure~\ref{lwm:fig:proofvf}). + +\begin{figure}[!t] + \centering + \includegraphics[width=.9\columnwidth]{src/lwm/img/proofvf.pdf} + \caption{% + Membership and non-membership verification time as a function of batch + size for a single and no match, respectively. + } + \label{lwm:fig:proofvf} +\end{figure} + +To evaluate the cost of generating and verifying a wild-card notification with +a large number of matches, we queried for com's entire top-level domain + (see Figure~\ref{lwm:fig:proofcom}). +In the largest batch where there are 352,383 matches, the proof +generation time is still relatively low: 134~ms. This corresponds to 28.9k +notifications per hour on a single core. The verification time is much larger: +3.5~seconds. This is expected since verification involves reconstructing the +root from all the matching leaves, which is at least as costly as creating a +snapshot of the same size + (cf.\ $2^{18}$ in Figure~\ref{lwm:fig:snapshot}). +While these are relevant performance numbers, anyone who is interested in a +top-level domain would likely just download the entire batch. +\begin{figure}[!t] + \centering + \includegraphics[width=.9\columnwidth]{src/lwm/img/proofcom.pdf} + \caption{% + Membership query and verification time for $\mathsf{*.com}$. 
+  }
+  \label{lwm:fig:proofcom}
+\end{figure}
+
+Finally, the space \emph{overhead} of a verifiable wild-card notification is
+dominated by the two audit paths that enclose the matching subject names. Given
+that an audit path contains at most $\ceil{\log_2 n}$ sibling hashes for a batch
+of size $n$, the median overhead is roughly one Kb per STH, log, and LWM
+subject. Viewed from the perspective of a self-monitor, this is a significant
+bandwidth improvement:
+  as opposed to downloading the median batch of 32.6~Mb,
+  one Kb and any matching certificate(s) suffice.
+In the case of multiple logs, the bandwidth improvement is even greater. For
+the notifier we already established that it is relatively cheap to generate new
+notifications. Namely, in the single-core case of 288~M notifications per hour
+the bandwidth overhead would be 640~Mbps (i.e., all proofs must be distributed
+before the next STH is issued). A notifier can thus notify for a dozen of logs
+and a significant amount of LWM subjects without running into any CPU or
+bandwidth restrictions. Notably this is under the assumption of a sound STH
+frequency---one hour in our evaluation, as used by Icarus and many other logs.
+
+\subsection{Related Work} \label{lwm:sec:eval:related}
+Earlier work related to transparent certificate and key management often use
+dynamic authenticated dictionaries~\cite{pad,accumulator,vds,aki}.
+CONIKS maps a user's mail address to her public key in a binary Merkle prefix
+tree, and after each update a client self-monitors her own key-binding by
+fetching an exact-match (non-)membership proof~\cite{coniks}. While our work
+is conceptually similar to CONIKS since a subject receives one (non-)membership
+proof per log update, the main difference is that LWM builds a new Merkle tree
+for each update in which wild-card queries are supported.
This idea is +inapplicable for CONIKS because a user is potentially interested in the public +key of any mail address (hence the ability to query the entire data structure +on an exact-match). +CONIKS is similarly inapplicable for self-monitors in CT because a subject cares +about \emph{wild-card queries} and \emph{new certificates}. +Without the need for wild-cards, any authenticated dictionary could be used as +a batch building block to instantiate LWM. +While a radix tree viewed as a Merkle tree could support efficient wild-card +proofs~\cite{patricia}, it is more complex than necessary. Therefore, we built +upon the work of Kocher~\cite{crt} and Nuckolls~\cite{range-mt} with a twist on +how to group the data for a new use-case: LWM. diff --git a/summary/src/lwm/src/introduction.tex b/summary/src/lwm/src/introduction.tex new file mode 100644 index 0000000..fce2bcf --- /dev/null +++ b/summary/src/lwm/src/introduction.tex @@ -0,0 +1,76 @@ +\section{Introduction} +Certificate Transparency (CT)~\cite{ct} is an experimental standard that +enhances the public-key infrastructure by adding transparency for certificates +that are issued by Certificate Authorities (CAs). The idea is to mandate +that every certificate must be publicly logged in an append-only tamper-evident +data structure~\cite{history-tree}, such that anyone can observe what has been +issued for whom. This means that a subject can determine for herself if anything +is mis-issued by downloading all certificates; so called \emph{self-monitoring}. +An alternative monitoring approach is to rely on a trusted third-party that +\emph{notifies} the subject if relevant certificates are ever found. Given that +self-monitoring involves set-up, continuous operation, and exhaustive +communication effort, the concept of subscribing for monitoring +\emph{as-a-service} is simpler for the subject. 
This model is already prevalent
+in the wild, and is provided both by CAs and industry vendors---see for example
+SSLMate's Cert Spotter~\cite{certspotter} or Facebook's monitoring
+tool~\cite{fbmon}. Third-party monitors can also offer related services, such
+as searching for certificates interactively or inspecting other log properties.
+The former is provided by Facebook and Comodo's \texttt{crt.sh}; the latter by
+Graham Edgecombe's CT monitoring tool~\cite{grahmon}.
+
+It would be an unfortunate short-coming if CT did not change the status quo of
+centralized trust by forcing subjects who cannot operate a self-monitor to trust
+certificate notifications that are provided by a third-party monitor. While it
+is true that a subject could subscribe to a large number of monitors to reduce
+this trust, it is overall cumbersome and does not scale well beyond a handful
+of notifying monitors (should they exist). To this end, we suggest a CT/bis
+extension for verifiable Light-Weight Monitoring (LWM) that makes it easier to
+distribute the trust which is otherwise placed in these monitors by decoupling
+the notifier from the full-audit function of inspecting all certificates. Our
+idea is best described in terms of a self-monitor that polls for new updates,
+but as opposed to processing all certificates we can filter on wild-card
+prefixes such as \texttt{*.example.com} in a verifiable manner.
+LWM relies on the ability to define a new Signed Tree Head (STH) extension, and
+thus a CT/bis compliant log is necessary~\cite{ct/bis}. At the time of writing
+CT/bis has yet to be published as an IETF standard. We are not aware of any log
+that deploys a drafted version.
+
+As a brief overview, each batch of newly included certificates is grouped as a
+static Merkle tree in LWM. The resulting snapshot (also known as a fingerprint
+or a root hash) is then incorporated into the corresponding STH as an extension.
+An LWM subject receives one verifiable certificate notification per log update +from an untrusted \emph{notifier} + (who could be the log, a monitor, or anyone else), +and this notification is based on the smaller static Merkle tree rather than the +complete log. This is because monitoring as-a-service is mainly about +identifying newly included certificates. Moreover, we can order each static +Merkle tree so that verifiable wild-card filtering is possible. For security we +rely on at least one entity to verify that each snapshot is correct---which is +a general monitoring function that is independent of the subjects using LWM---as +well as a gossip protocol that detects split-views~\cite{sthnp}. Since our +extension is part of an STH, we piggyback on any gossip-like protocol that deals +with the exchange and/or distribution of (verified) +STHs~\cite{ctga,ietf-gossip,sth-push,cosi}. Our contributions are as follows: +\begin{itemize} + \item The design of a backwards-compatible CT/bis extension for light-weight + monitoring of wild-card prefixes such as \texttt{*.example.com} + (Section~\ref{lwm:sec:lwm}). + \item A security sketch showing that an attacker cannot omit a certificate + notification without being detected, relying on standard cryptographic + assumptions and piggybacking on the proposed gossip-audit models of CT + (Section~\ref{lwm:sec:eval:security}). + \item An open-source proof-of-concept implementation written in Go, as well + as a performance evaluation that considers computation time and bandwidth + requirements (Section~\ref{lwm:sec:eval:perf}). 
In particular we find that the + overhead during tree head construction is small in comparison to a sound STH + frequency of one hour; a notifier can easily notify 288~M subjects in a + verifiable manner for Google's Icarus log on a single core and a 1~Gbps + connection; and a subject receives about 24~Kb of proofs per day and log + which is verified in negligible time (the order of $\mu$s for the common + case of non-membership, and seconds in the extreme case of verifying + membership for \emph{an entire top-level domain}). +\end{itemize} + +Background on Merkle trees and CT is provided in Section~\ref{lwm:sec:background}. +Related work is discussed in Section~\ref{lwm:sec:eval:related}. +Conclusions are presented in Section~\ref{lwm:sec:conclusion}. diff --git a/summary/src/lwm/src/lwm.tex b/summary/src/lwm/src/lwm.tex new file mode 100644 index 0000000..70641a8 --- /dev/null +++ b/summary/src/lwm/src/lwm.tex @@ -0,0 +1,148 @@ +\section{Light-Weight Monitoring} \label{lwm:sec:lwm} +To reduce the trust which is placed in today's third-party monitors, +the idea of LWM is to lower the barrier towards self-monitoring. As shown in +Figure~\ref{lwm:fig:idea}, an untrusted notifier provides a subject with +efficient\footnote{% + Efficient iff less than a linear number of log entries are received per log + update. +} certificate notifications that can be cryptographically verified: each batch +of certificates is represented by an additional Merkle tree that supports +wild-card (non-)membership queries + (described further in Section~\ref{lwm:sec:lwm:wildcard}), +and the resulting snapshot is signed by the log as part of an STH extension. +As such, a subject can deal only with those certificates that are relevant, +relying on wild-card proofs to verify correctness and completeness: + said certificates are included and nothing is being omitted. +Anyone can check that an LWM snapshot is correct by inspecting the corresponding +batch of certificates. 
Notably this is \emph{a general monitoring function}, +rather than a \emph{selective notification component} which is verifiable in +LWM. This decoupling allows anyone to be a notifier, including logs and +monitors that a subject distrust. +\begin{figure} + \centering + \input{src/lwm/img/overview} + \caption{% + An overview of LWM. In addition to normal operation, a log creates an + additional (smaller) Merkle tree that supports wild-card (non-)membership + queries. The resulting snapshot is signed as part of an STH extension that + can be verified by any monitor that downloads the corresponding batch. A + subject receives one verifiable certificate notification per STH from an + untrusted notifier. + } + \label{lwm:fig:idea} +\end{figure} + +\subsection{Authenticated Wild-Card Queries} \label{lwm:sec:lwm:wildcard} +Thus far we only discussed Merkle trees in terms of verifying whether a single +value is a (non-)member: + membership is proven by presenting an audit path down to the leaf in question, + while + non-membership requires a lexicographical ordering that allows a verifier + to conclude that a value is absent unless provided in a particular location. +The latter concept naturally extends to \emph{prefix wild-card queries}---such +as $\mathsf{*.example.com}$ and $\mathsf{*.sub.example.com}$---by finding a +suitable ordering function $\Omega$ which ensures that related leaves are +grouped together as a consecutive range. We found that this requirement is +satisfied by sorting on reversed subject names: + suppose that we have a batch of certificates + $\mathsf{example.com}$, + $\mathsf{example.org}$, + $\mathsf{example.net}$, and + $\mathsf{sub.example.com}$. +After applying $\Omega$ we get the static Merkle tree in +Figure~\ref{lwm:fig:wildcard}. A prefix wild-card proof is constructed by +finding the relevant range in question, generating an audit path for the +leaves that are right outside of the range~\cite{range-mt}. 
Such a proof is +verified by checking that + (i) $\Omega$ indicates that the left (right) end is less (larger) than the + queried prefix, + (ii) the leaves are ordered as dictated by $\Omega$, and +(iii) the recomputed root hash is valid. + +\begin{figure} + \centering + \input{src/lwm/img/wildcard} + \caption{% + Merkle tree where the leaves are ordered on reversed subject names. + } + \label{lwm:fig:wildcard} +\end{figure} + +The exact details of reconstructing the root hash is a bit tedious because there +are several corner cases. For example, either or both of the two audit paths may +be empty depending on batch size (${\leq}1$) and location of the relevant range +(left/right-most side). Therefore, we omit the details and +focus on the concept: + given two audit paths and a sequence of data items ordered by $\Omega$ that + includes the left leaf, matching range, and right leaf, repeatedly + reconstruct interior nodes to the largest extent possible and then use the + sibling hash which is furthest from the root to continue. +For example, consider a proof for $\mathsf{*sub.example.com}$ in +Figure~\ref{lwm:fig:wildcard}: it is composed of + (i) the left leaf data and its audit path $h_0,h_{23}$ on index 1, + (ii) the right leaf data and its audit path $h_2,h_{01}$ on index 3, and +(iii) the matching range itself which is a single certificate. +After verifying $\Omega$~order, recompute the root hash $r'$ and check if +it matches an authentic root $r$ as follows: +\begin{enumerate} + \item Compute leaf hashes $h_1'$, $h_2'$, and $h_3'$ from the provided data. + Next, compute the interior node $h_{23}' \gets \hash(h_2'\concat h_3')$. + Because no additional interior node can be computed without a sibling hash, + consider $h_0$ in the left audit path. + \item Compute the interior node $h_{01}' \gets \hash(h_0\concat h_1')$, then + $r' \gets \hash(h_{01}'\concat h_{23}')$.\footnote{% + Two audit paths may contain redundancy, but we ignored this favouring + simplicity. 
+ }
+\end{enumerate}
+
+Given an $\Omega$~ordered list of certificates it is trivial to locate where
+a subject's wild-card matches are:
+  binary search to find the index of an exact match (if any), then up to
+  $t$ matches follow in order.
+This is not the only way to find the right range and matches. For example, a
+radix tree could be used with the main difference being $\bigO{t+\log n}$
+against $\bigO{t+k}$ complexity for a batch of size $n$, a wild-card string of
+length $k$, and $t$ matches.  Since the complexity of generating two audit
+paths is $\bigO{\log n}$ for any number of matches, the final space and time
+complexity for a wild-card structure based on an ordered list is
+$\bigO{t+\log n}$.
+
+\subsection{Notifier} \label{lwm:sec:lwm:notifier}
+A notifier must obtain every STH to generate wild-card proofs that can be traced
+back to the log.  Albeit error-prone in case of network issues, the simplest
+way to go about this is to poll the log's get-STH endpoint \emph{frequently
+enough}.\footnote{%%
+  It would be better if logs supported verifiable and historical get-STH
+  queries.
+}
+Once an update is spotted, every new certificate is downloaded and the wild-card
+structure is reconstructed.  A subject receives her verifiable certificate
+notifications from the notifier via a push (`monitoring as-a-service') or pull
+(`self-monitoring') model.  For example, emails could be delivered after every
+update or in daily digests.  Another option is to support queries like
+  ``what's new since STH~$x$''.
+
+A subject can verify that a certificate notification is fresh by inspecting the
+STH timestamp.  However, it is hard to detect missing certificate notifications
+unless every STH trivially follows from the previous one.  While there are
+several methods to achieve this---%
+  for example using indices (Section~\ref{lwm:sec:lwm:instantiation}) or
+  hash chains~\cite{coniks}---%
+the log must always sign a snapshot per STH using an extension.
+ +\subsection{Instantiation Example} \label{lwm:sec:lwm:instantiation} +Instantiating LWM depends upon the ability to support an STH extension. In the +latest version of CT, this takes the form of a sorted list of +key-value pairs where the key is unique and the value an opaque byte +array~\cite{ct/bis}. We could reserve the keywords \emph{lwm} for snapshots and +\emph{index} for monotonically increasing counters.\footnote{% + Instead of an index to detect missing notifications (STHs), a log could + announce STHs as part of a verifiable get-STH endpoint. See the sketch of + Nordberg~\cite{nordberg-sketch}. +} Besides an LWM-compliant log, an untrusted notifier must support pushed or +pulled certificate notifications that are verifiable by tracking the most recent +or every wild-card structure. Examples of likely notifiers include + logs (who benefit from the reduced bandwidth) and + monitors (who could market increased transparency) +that already process all certificates regardless of LWM. 
diff --git a/summary/src/lwm/src/references.bib b/summary/src/lwm/src/references.bib new file mode 100644 index 0000000..f3fe96a --- /dev/null +++ b/summary/src/lwm/src/references.bib @@ -0,0 +1,255 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% References % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +@misc{patricia, + author = {Ethereum/wiki}, + title = {Patricia Tree}, + howpublished = {\url{https://github.com/ethereum/wiki/wiki/Patricia-Tree}, accessed 2018-08-15}, +} + +@misc{log-growth, + author = {SSLMate}, + title = {{Certificate Transparency} Log Growth}, + howpublished = {\url{https://sslmate.com/labs/ct_growth/}, accessed 2018-08-15}, +} + +@misc{cert-size, + author = {Graham Edgecombe}, + title = {Compressing {X.509} certificates}, + howpublished = {\url{https://www.grahamedgecombe.com/blog/2016/12/22/compressing-x509-certificates}, accessed 2018-08-15}, +} + +@misc{alexa, + author = {Amazon}, + title = {Alexa {top-1M}}, + howpublished = {\url{http://s3.amazonaws.com/alexa-static/top-1m.csv.zip}, accessed 2018-08-05}, +} + +@misc{artifact, + title = {Paper artifact}, + howpublished = {\url{https://github.com/rgdd/lwm}}, + year = {2018}, +} + +@misc{nordberg-sketch, + author = {Linus Nordberg}, + title = {{Re: [Trans] Providing} the history of {STHs} a log has issued (in 6962-bis)}, + howpublished = {\url{https://mailarchive.ietf.org/arch/msg/trans/JbFiwO90PjcYzXrEgh-Y7bFG5Fw}, accessed 2018-09-16}, +} + +@misc{grahmon, + author = {Graham Edgecombe}, + title = {{Certificate Transparency} Monitor}, + howpublished = {\url{https://ct.grahamedgecombe.com/}, accessed 2018-09-15.}, +} + +@misc{fbmon, + author = {Facebook}, + title = {{Certificate Transparency} Monitoring}, + howpublished = {\url{https://developers.facebook.com/tools/ct/}, accessed 2018-09-15}, +} + +@misc{certspotter, + author = {SSLMate}, + title = {Better uptime and security with {Cert Spotter}}, + howpublished = 
{\url{https://sslmate.com/certspotter/}, accessed 2018-09-15}, +} + +@article{ctga, + author = {Rasmus Dahlberg and + Tobias Pulls and + Jonathan Vestin and + Toke H{\o}iland{-}J{\o}rgensen and + Andreas Kassler}, + title = {Aggregation-Based Gossip for {Certificate Transparency}}, + journal = {CoRR abs/1806.08817}, + year = {2018}, +} + +@misc{enisa, + author = {ENISA}, + title = {Certificate Authorities---The Weak Link of Internet Security}, + howpublished = {\url{https://www.enisa.europa.eu/publications/info-notes/certificate-authorities-the-weak-link-of-internet-security}, accessed 2018-09-16}, +} + +@inproceedings{sthnp, + author = {Laurent Chuat and Pawel Szalachowski and Adrian Perrig and Ben Laurie and Eran Messeri}, + title = {Efficient gossip protocols for verifying the consistency of Certificate logs}, + booktitle = {CNS}, + year = {2015}, +} + +@misc{sth-push, + author = {Ryan Sleevi and Eran Messeri}, + title = {{Certificate Transparency} in {Chrome}: Monitoring {CT} Logs consistency}, + howpublished = {\url{https://docs.google.com/document/d/1FP5J5Sfsg0OR9P4YT0q1dM02iavhi8ix1mZlZe_z-ls/edit?pref=2&pli=1}, accessed 2018-09-16}, +} + +@misc{chrome-policy, + author = {Ben Laurie}, + title = {Improving the Security of {EV} Certificates}, + howpublished = {\url{https://goo.gl/DdEKz1}, accessed 2018-09-16}, +} + +@misc{vds, + author = {Adam Eijdenberg and Ben Laurie and Al Cutter}, + title = {Verifiable Data Structures}, + howpublished = {\url{https://github.com/google/trillian/blob/master/docs/VerifiableDataStructures.pdf}, accessed 2018-09-16}, +} + +@inproceedings{cosi, + author = {Ewa Syta and + Iulia Tamas and + Dylan Visher and + David Isaac Wolinsky and + Philipp Jovanovic and + Linus Gasser and + Nicolas Gailly and + Ismail Khoffi and + Bryan Ford}, + title = {Keeping Authorities ``Honest or Bust'' with Decentralized Witness Cosigning}, + booktitle = {{IEEE S\&P}}, + year = {2016}, + month = {May}, +} + +@inproceedings{ads, + author = {Roberto 
Tamassia}, + title = {Authenticated Data Structures}, + booktitle = {{ESA}}, + year = {2003}, +} + +@inproceedings{history-independence, + author = {Moni Naor and Vanessa Teague}, + title = {Anti-persistence: History Independent Data Structures}, + booktitle = {{STOC}}, + year = {2001}, +} + +@techreport{ct, + author = {Ben Laurie and Adam Langley and Emilia Kasper}, + title = {{Certificate Transparency}}, + type = {RFC}, + number = {6962}, + institution = {IETF}, + year = {2013}, +} + +@techreport{dns-name-ref, + author = {Mockapetris, Paul}, + title = {Domain Names---Implementation and specification}, + type = {RFC}, + number = {1035}, + institution = {IETF}, + year = {1987}, +} + +@techreport{ct/bis, + author = {Ben Laurie and Adam Langley and Emilia Kasper and Eran Messeri and Rob Stradling}, + title = {{Certificate Transparency} Version 2.0}, + number = {draft-ietf-trans-rfc6962-bis-28}, + type = {Internet-draft}, + institution = {IETF}, + year = {2018}, +} + +@techreport{ietf-gossip, + author = {Linus Nordberg and Daniel Kahn Gillmor and Tom Ritter}, + title = {Gossiping in {CT}}, + number = {draft-ietf-trans-gossip-05}, + type = {Internet-draft}, + institution = {IETF}, + year = {2018}, +} + +@inproceedings{balloon, + author = {Tobias Pulls and Roel Peeters}, + title = {Balloon: A Forward-Secure Append-Only Persistent Authenticated Data Structure}, + booktitle = {{ESORICS}}, + year = {2015}, +} + +@inproceedings{mt, + author = {Ralph C. Merkle}, + title = {A Digital Signature Based on a Conventional Encryption Function}, + booktitle = {{CRYPTO}}, + year = {1987}, +} + +@inproceedings{coniks, + author = {Marcela S. Melara and + Aaron Blankstein and + Joseph Bonneau and + Edward W. Felten and + Michael J. 
Freedman}, + title = {{CONIKS:} {Bringing} Key Transparency to End Users}, + booktitle = {{USENIX} Security}, + year = {2015}, +} + +@inproceedings{aki, + author = {Tiffany Hyun{-}Jin Kim and + Lin{-}Shung Huang and + Adrian Perrig and + Collin Jackson and + Virgil D. Gligor}, + title = {Accountable key infrastructure {(AKI):} {A} proposal for a public-key validation infrastructure}, + booktitle = {{WWW}}, + year = {2013}, +} + +@inproceedings{history-tree, + author = {Scott A. Crosby and Dan S. Wallach}, + title = {Efficient Data Structures For Tamper-Evident Logging}, + booktitle = {{USENIX} Security}, + year = {2009}, +} + +@article{pad, + author = {Scott A. Crosby and Dan S. Wallach}, + title = {Authenticated Dictionaries: {Real}-World Costs and Trade-Offs}, + journal = {{ACM} {TISSEC}}, + volume = {14}, + number = {2}, + year = {2011}, +} + +@inproceedings{katz, + author = {Jonathan Katz}, + title = {Analysis of a Proposed Hash-Based Signature Standard}, + booktitle = {{SSR}}, + year = {2016}, +} + +@inproceedings{ca-ecosystem, + author = {Zakir Durumeric and + James Kasten and + Michael Bailey and + J. Alex Halderman}, + title = {Analysis of the {HTTPS} certificate ecosystem}, + booktitle = {{IMC}}, + year = {2013}, +} + +@inproceedings{accumulator, + author = {David Derler and Christian Hanser and Daniel Slamanig}, + title = {Revisiting Cryptographic Accumulators, Additional Properties and + Relations to Other Primitives}, + booktitle = {{CT-RSA}}, + year = {2015}, +} + +@inproceedings{crt, + author = {Paul C. 
Kocher}, + title = {On Certificate Revocation and Validation}, + booktitle = {{FC}}, + year = {1998}, +} + +@inproceedings{range-mt, + author = {Glen Nuckolls}, + title = {Verified Query Results from Hybrid Authentication Trees}, + booktitle = {{IFIP} {WG} 11.3 Working Conference on Data and Application Security}, + year = {2005}, +} diff --git a/summary/src/other.tex b/summary/src/other.tex new file mode 100644 index 0000000..dda33d4 --- /dev/null +++ b/summary/src/other.tex @@ -0,0 +1,36 @@ +\section*{List of Other Contributions} + +Throughout and before my PhD studies, I also contributed to the following: + +\begin{itemize} + \item The Sigsum Project. + \url{https://www.sigsum.org/} (2020-present). + + Sigsum is a free and open-source software project that brings transparency + to signed checksums. The design is simple and uses a proactive gossip-audit + model. I founded Sigsum with Linus Nordberg and Fredrik Str\"{o}mberg while + working at Mullvad VPN as a side-line occupation. + + \item Tobias Pulls and \textbf{Rasmus Dahlberg}. + Steady: + A Simple End-to-End Secure Logging System. + NordSec (2018). + + Tobias was the main driver. I mostly contributed with design discussions + and a security formalization. + + \item \textbf{Rasmus Dahlberg}, Tobias Pulls, and Roel Peeters. + Efficient Sparse Merkle Trees: + Caching Strategies and Secure (Non-)Membership Proofs. + NordSec (2016). + + This work started with my Bachelor's thesis. I did most writing with + input from Tobias and Roel. Tobias did our benchmarking experiments. + + \item \textbf{Rasmus Dahlberg} and Tobias Pulls. + Standardized Syslog Processing: + Revisiting Secure Reliable Data Transfer and Message Compression. + Technical Report, Karlstad University (2016). + + I was the main driver. Tobias contributed with continuous feedback. 
+\end{itemize} diff --git a/summary/src/sammanfattning.tex b/summary/src/sammanfattning.tex new file mode 100644 index 0000000..dbc68d5 --- /dev/null +++ b/summary/src/sammanfattning.tex @@ -0,0 +1,41 @@ +Projektet \emph{Certificate Transparency} är ett ekosystem av loggar, övervakare +och granskare som håller certifikatutfärdare till svars för utfärdade +webbcertifikat. Vi visar hur säkerheten kan höjas i ekosystemet för både +domäninnehavare och TLS-klienter i nuvarande system samt som del av +anonymitetsnätverket Tor. Bland våra större bidrag är + förbättrad övervakning av loggarna, + ett skvallerprotokoll som integrerats med DNS, + ett skvaller- och granskningsprotokoll som utformats specifikt för Tors + webbläsare och + ett förslag på hur domännamn med adresser i Tor kan bli mer tillgängliga. +De metoder som använts varierar från säkerhetsbevis till internetmätningar och +utvärderingar av forskningsprototyper. +En viktig del av vår utvärdering i Tor är att avgöra hur protokoll som +används av webbläsare påverkar möjligheten att koppla ihop användare med besökta +webbplatser. Detta inkluderar existerande protokoll samt nya tillägg för att +verifiera om webbplatsers certifikat är transparensloggade. Våra resultat visar +att i många fall kan falska positiva utslag filtreras bort vid +mönsterigenkänning av Tor-användares krypterade trafik +(eng: \emph{website fingerprinting}). +Orsaken är att besök till de flesta webbplatser kan uteslutas till följd av hur +internetprotokoll fungerar: kommunikation är observerbar och involverar ofta +interaktioner med tredjeparter. Vissa protokoll har dessutom sidokanaler som +kan analyseras. Vi visar exempelvis att Tors DNS-cache kan undersökas med olika +varianter av tidtagningsattacker. Dessa attacker är enkla att utföra över +internet och avslöjar vilka domännamn som slagits upp vid angivna tidpunkter. +De förbättrade mönsterigenkänningsattackerna mot webbplatser är realistiska och +hotar därför Tors anonymitet. 
+Vår slutsats är att framtida försvar bör utvärderas utifrån att angripare har +tillgång till ett så kallat webbplatsorakel. + +\paragraph{Nyckelord:} + Granskning, + Certificate Transparency, + DNS, + Skvaller, + Sidokanaler, + Tidtagningsattacker, + Tor, + Tors webbläsare, + Mönsterigenkänning,% + Webbplatsorakel diff --git a/summary/src/sauteed/.gitignore b/summary/src/sauteed/.gitignore new file mode 100644 index 0000000..8bb88c8 --- /dev/null +++ b/summary/src/sauteed/.gitignore @@ -0,0 +1,9 @@ +main.pdf +*.blg +*.bbl +*.fls +*.fdb_latexmk +*.log +*.out +*.aux +*.swp diff --git a/summary/src/sauteed/img/onion-location.pdf b/summary/src/sauteed/img/onion-location.pdf new file mode 100644 index 0000000..21fde5f Binary files /dev/null and b/summary/src/sauteed/img/onion-location.pdf differ diff --git a/summary/src/sauteed/img/onion-search.pdf b/summary/src/sauteed/img/onion-search.pdf new file mode 100644 index 0000000..5867270 Binary files /dev/null and b/summary/src/sauteed/img/onion-search.pdf differ diff --git a/summary/src/sauteed/main.tex b/summary/src/sauteed/main.tex new file mode 100644 index 0000000..e0c7cda --- /dev/null +++ b/summary/src/sauteed/main.tex @@ -0,0 +1,64 @@ +\begin{kaupaper}[ + author={% + \textbf{Rasmus Dahlberg}, + Paul Syverson, + Linus Nordberg, and + Matthew Finkel + }, + title={% + Sauteed Onions: Transparent Associations from Domain Names to Onion Addresses + }, + reference={% + WPES (2022) + }, + summary={% + Many prominent websites are also hosted as Tor onion services. Onion + services are identified by their public keys and subject to onion routing, + thus offering self-authenticated connections and censorship resistance. + However, the non-mnemonic names are a limitation due to being hard to + discover and remember. 
We explore how certificates with onion addresses + may improve the status quo by proposing sauteed onions, \emph{transparent + associations from domain names to onion addresses} with the help of + Certificate Transparency logs. The idea is to extend a website's regular + certificate with an associated onion address. This makes it possible to + offer certificate-based onion location that is no less targeted than the + HTTPS connection facilitating the discovery, as well as name-to-onion + search engines that use the append-only logs for verifiable population of + their databases. The achieved goals are + consistency of available onion associations, + improved third-party discovery of onion associations, and + forward censorship-resistance. + To be discovered, sites must opt-in by obtaining a sauteed onion + certificate. Our prototypes for certificate-based onion location and + third-party search engines use an existing backward-compatible format. We + discuss this trade-off and note that a certificate extension may be used + in the future. + }, + participation={\vspace{-.25cm} + Paul, Linus, and I had the initial idea of exploring how onion addresses + fit into Certificate Transparency. Paul and I did most of the writing. I + implemented our monitor, Linus our search engine, Matt our web extension. 
+ }, + label={ + paper:sauteed + }, +] + \maketitle + \begin{abstract} + \input{src/sauteed/src/abstract} + \end{abstract} + + \input{src/sauteed/src/intro} + \input{src/sauteed/src/preliminaries} + \input{src/sauteed/src/sauteed} + \input{src/sauteed/src/related} + \input{src/sauteed/src/conc} + \input{src/sauteed/src/acks} + + \bibliographystyle{plain} + \bibliography{src/sauteed/src/refs} + + \begin{appendices} + \input{src/sauteed/src/appendix} + \end{appendices} +\end{kaupaper} diff --git a/summary/src/sauteed/src/abstract.tex b/summary/src/sauteed/src/abstract.tex new file mode 100644 index 0000000..8bdcd81 --- /dev/null +++ b/summary/src/sauteed/src/abstract.tex @@ -0,0 +1,22 @@ +\noindent +Onion addresses offer valuable features such as lookup and routing +security, self-authenticated connections, and censorship resistance. +Therefore, many websites are also available as onionsites in Tor. The way +registered domains and onion addresses are associated is however a weak link. +We introduce \emph{sauteed onions}, \emph{transparent associations from domain +names to onion addresses}. +Our approach relies on TLS certificates to establish onion associations. It is +much like today's onion location which relies on Certificate Authorities (CAs) +due to its HTTPS requirement, but has the added benefit of becoming public for +everyone to see in Certificate Transparency (CT) logs. We propose and prototype +two uses of sauteed onions: + certificate-based onion location and + search engines that use CT logs as the underlying database. +The achieved goals are + \emph{consistency of available onion associations}, which mitigates attacks + where users are partitioned depending on which onion addresses they are + given, + \emph{forward censorship-resistance} after a TLS site has been configured + once, and + \emph{improved third-party discovery of onion associations}, which requires + less trust while easily scaling to all onionsites that opt-in. 
diff --git a/summary/src/sauteed/src/acks.tex b/summary/src/sauteed/src/acks.tex new file mode 100644 index 0000000..590558a --- /dev/null +++ b/summary/src/sauteed/src/acks.tex @@ -0,0 +1,9 @@ +\section*{Acknowledgments} +We would like to thank + Kushal Das, + Daniel Kahn Gillmor, + Silvio Rhatto, and + Tobias Pulls +for helpful discussions and comments. +Rasmus Dahlberg was supported by the Swedish Foundation for Strategic Research. +Paul Syverson was supported by ONR\@. diff --git a/summary/src/sauteed/src/appendix.tex b/summary/src/sauteed/src/appendix.tex new file mode 100644 index 0000000..7850d49 --- /dev/null +++ b/summary/src/sauteed/src/appendix.tex @@ -0,0 +1,79 @@ +\section{Onion Association Search Examples} \label{sauteed:app:search} +We host the search engine described in Section~\ref{sauteed:sec:search-engine} on a +Debian VM with 1GB RAM, 20GB SSD, and a single vCPU. It is available at +\texttt{api.sauteed-onions.o +rg} as well as +\texttt{zpadxxmoi42k45iifrzuktwqktihf5didbaec3xo4dhvlw2hj54 +doiqd.onion}. +Please note that we operate this prototype on a best-effort level until +December, 2022. + +An example for the \texttt{search} endpoint is provided in +Figure~\ref{sauteed:fig:search}, followed by extracting additional certificate +information using the \texttt{get} endpoint in Figure~\ref{sauteed:fig:get}. There are +many CT-logged certificates for the same onion association because certificates +are renewed periodically and typically submitted to multiple CT logs. 
+ +\begin{sidewaysfigure}[!t] + \centering + \begin{lstlisting} + $ curl -s https://api.sauteed-onions.org/search?in=www.sauteed-onions.org | json_pp + [ + { + "identifiers" : [ + "2", + "3", + "24", + "25", + "28", + "29", + "37" + ], + "onion_addr" : "qvrbktnwsztjnbga6yyjbwzsdjw7u5a6vsyzv6hkj75clog4pdvy4cyd.onion", + "domain_name" : "www.sauteed-onions.org" + } + ] + \end{lstlisting} + \caption{Find onion associations for \texttt{www.sauteed-onions.org}.} + \label{sauteed:fig:search} +\end{sidewaysfigure} + +\begin{sidewaysfigure}[!t] + \centering + \begin{lstlisting} + $ curl -s https://api.sauteed-onions.org/get?id=2 | json_pp + { + "onion_addr" : "qvrbktnwsztjnbga6yyjbwzsdjw7u5a6vsyzv6hkj75clog4pdvy4cyd.onion", + "domain_name" : "www.sauteed-onions.org", + "log_id" : "b1N2rDHwMRnYmQCkURX/dxUcEdkCwQApBo2yCJo32RM=", + "log_index" : 582362461, + "cert_path" : "db/logs/Mammoth/582362461.pem" + } + $ curl -L https://api.sauteed-onions.org/db/logs/Mammoth/582362461.pem | \ + openssl x509 -text -noout + ... + \end{lstlisting} + \caption{Get further information relating to the certificate with identifier ``2''.} + \label{sauteed:fig:get} +\end{sidewaysfigure} + +\section{Configuration Example} \label{sauteed:app:setup} + +We used \texttt{certbot} to set up sauteed onions using Let's Encrypt and +\texttt{apache} on a Debian system. The difference when +compared to the usual \texttt{certbot} instructions is that the \texttt{-d} flag +must be specified to enumerate all SANs as a comma-separated +list~\cite{certbot}. The domain name with an associated onion address as a +subdomain also needs to be reachable via DNS for Let's Encrypt to perform domain +validation. Therefore, an appropriate A/AAAA or CNAME record is required. 
A +sanity-check for \texttt{www.sauteed-onions.org} would be to verify that + \texttt{dig qvrbktnwsztjnbga6yyjbwzsdjw7u5a6vsyzv6hkj75clog4pdvy4cydonion + .www.sauteed-onions.org} +returns the same IP address as + \texttt{dig www.sauteed + -onions.org} +before running + \texttt{certbot --apache -d~www.sauteed-onions. + org,qvrbktnwsztjnbga6yyjbwzsdjw7u5a6vsyzv6hkj75clog4pdvy4cydo + nion.www.sauteed-onions.org}. +See \texttt{crt.sh} for an example certificate~\cite{sauteed-onion-cert}. diff --git a/summary/src/sauteed/src/conc.tex b/summary/src/sauteed/src/conc.tex new file mode 100644 index 0000000..2b26a56 --- /dev/null +++ b/summary/src/sauteed/src/conc.tex @@ -0,0 +1,16 @@ +\section{Conclusion} \label{sauteed:sec:conclusion} +Sauteed onions declare unidirectional associations from domain names to onion +addresses. These onion associations are established in CA-issued and CT-logged +TLS certificates, thereby making them public for everyone to see. We propose +two immediate applications: + certificate-based onion location and + more automated verifiable search. +Both applications are opt-in for domain owners, and rely on similar assumptions +as today's onion location. The added benefit is more transparency, which +facilitates a higher degree of consistency between found onion associations as +well as more censorship-resistance for TLS sites after setup. Configuration of +sauteed onions requires one more DNS record and a domain-validated certificate +from any CA (such as Let's Encrypt). In the future, the additional DNS record +may be replaced by an X.509v3 extension. We leave it as a fun exercise to find +the onion address of a TLS site that is intentionally being censored by us: +\texttt{blocked.sauteed-onions.org}. 
diff --git a/summary/src/sauteed/src/intro.tex b/summary/src/sauteed/src/intro.tex new file mode 100644 index 0000000..bfad238 --- /dev/null +++ b/summary/src/sauteed/src/intro.tex @@ -0,0 +1,45 @@ +\section{Introduction} \label{sauteed:sec:intro} +Onion addresses are domain names with many useful properties. For example, an +onion address is self-authenticated due to encoding its own public key. It +also makes integral use of the anonymity network Tor to provide secure and +private lookups as well as routing~\cite{tor-design}. A major usability concern +is that onion addresses are random-looking strings; they are difficult to +discover, update, and remember~\cite{winter}. Existing solutions approach these +limitations in different ways, e.g., ranging from setting onion addresses in +HTTP headers over HTTPS with so-called \emph{onion +location}~\cite{onion-location} and bookmarking found results to making use of +manually curated third-party +lists~\cite{muffet-onions,onion-service-overview,h-e-securedrop} as well as +search engines like DuckDuckGo or \texttt{ahmia.fi}~\cite{nurmi,winter}. + +Herein we refer to the unidirectional association from a domain name to an onion +address as an \emph{onion association}. The overall goal is to facilitate +transparent discovery of onion associations. To achieve this we rely on the +observation that today's onion location can be implemented in certificates +issued by Certificate Authorities (CAs). This is not an additional dependency +because onion location already requires HTTPS~\cite{onion-location}. The main +benefit of transitioning from HTTP headers to TLS certificates is that all such +onion associations become signed and sequenced in tamper-evident Certificate +Transparency (CT) logs~\cite{ct/a,ct-rfc}, further tightening the relation +between CAs and onion keys~\cite{cab-ballot144,cab-onion-dv,secdev19} as well as +public CT logging and Tor~\cite{ctor-popets,muffet-onions}. 
+ +Our first contribution is to make onion associations identical for all Tor +users, and otherwise the possibility of inconsistencies becomes public via CT. +Consistency of available onion associations mitigates the threat of users +being partitioned without anyone noticing into subsets according to which +onion address they received during onion association. +Our second contribution is to construct a search engine that allows Tor users to +look up onion associations without having to trust the service provider +completely. Other than being helpful to validate onion addresses as +authentic~\cite{winter}, such discovery can continue to work \emph{after} a TLS +site becomes censored. + +Section~\ref{sauteed:sec:preliminaries} briefly covers CT preliminaries. +Section~\ref{sauteed:sec:trans} describes \emph{sauteed onions}, an approach that makes +discovery of onion associations more transparent and censorship-resistant +compared to today. Section~\ref{sauteed:sec:related} discusses related work. +Section~\ref{sauteed:sec:conclusion} concludes the paper. +Appendix~\ref{sauteed:app:search} contains query examples for our search engine. +Appendix~\ref{sauteed:app:setup} outlines an example setup. +All artifacts are online~\cite{sauteed-onion-artifacts}. diff --git a/summary/src/sauteed/src/preliminaries.tex b/summary/src/sauteed/src/preliminaries.tex new file mode 100644 index 0000000..f01aeba --- /dev/null +++ b/summary/src/sauteed/src/preliminaries.tex @@ -0,0 +1,25 @@ +\section{Certificate Logging Preliminaries} \label{sauteed:sec:preliminaries} +CT is a system of public append-only logs that store TLS certificates issued by +trusted CAs~\cite{ct/a,ct-rfc}. If web browsers add the criterion that a +certificate must be logged before accepting it as valid, certificate issuance +practices by CAs effectively become transparent so that mistakes and malfeasance +can be detected by anyone that observes the logs. 
These observers are +called \emph{monitors} because they download every certificate from all logs. +One can self-host a monitor, or use a third-party service like +\texttt{crt.sh}, or follow other models based on +subscriptions~\cite{lwm,li}. To avoid introducing more parties that are trusted +blindly as in the CA ecosystem, CT stands on a cryptographic foundation that +permits efficient verification of inclusion (a certificate is in the log) and +the append-only property (no certificate has been removed or +modified)~\cite{ct-foundation}. A party engaging in verification of these +(logarithmic) proofs is called an \emph{auditor}. + +In practice, CT has been rolled-out gradually to not break the +web~\cite{does-ct-break-the-web}. One facilitating factor has been the +introduction of Signed Certificate Timestamps (SCTs). An SCT is a log's +\emph{promise} to include a certificate within a certain amount of time; +typically 24~hours. This guarantees low-latency certificate issuance so that +CAs can \emph{embed} SCTs in certificates to keep web servers oblivious to CT. +Google Chrome and Apple's Safari require SCTs before accepting a certificate as +valid, and steps towards further SCT verification have been taken +recently~\cite{ct-in-chrome}. Tor Browser does not require CT yet~\cite{ctor-popets}. 
diff --git a/summary/src/sauteed/src/refs.bib b/summary/src/sauteed/src/refs.bib new file mode 100644 index 0000000..876fed4 --- /dev/null +++ b/summary/src/sauteed/src/refs.bib @@ -0,0 +1,325 @@ +@misc{ahmia.fi, + author = {Ahmia}, + title = {Indexing and crawling}, + howpublished = {\url{https://ahmia.fi/documentation/indexing/}, accessed 2022-08-01}, +} + +@misc{sauteed-onion-artifacts, + title = {Paper artifact}, + howpublished = {\url{https://gitlab.torproject.org/tpo/onion-services/sauteed-onions}}, + year = {2022} +} + +@inproceedings{le, + author = {Josh Aas and Richard Barnes and Benton Case and Zakir Durumeric and Peter Eckersley and Alan Flores{-}L{\'{o}}pez and J. Alex Halderman and Jacob Hoffman{-}Andrews and James Kasten and Eric Rescorla and Seth D. Schoen and Brad Warren}, + title = {{Let's Encrypt}: An Automated Certificate Authority to Encrypt the Entire Web}, + booktitle = {{CCS}}, + year = {2019}, +} + +@inproceedings{le-multi-path, + author = {Henry Birge{-}Lee and Liang Wang and Daniel McCarney and Roland Shoemaker and Jennifer Rexford and Prateek Mittal}, + title = {Experiences Deploying Multi-Vantage-Point Domain Validation at {Let's Encrypt}}, + booktitle = {{USENIX} Security}, + year = {2021}, +} + +@Misc{cab-ballot144, + author = {\relax{{CA}/Browser Forum}}, + title = {Ballot 144 -- Validation rules for .onion names}, + howpublished = {\url{https://cabforum.org/2015/02/18/ballot-144-validation-rules-dot-onion-names/}, accessed 2022-08-01}, +} + +@misc{cab-onion-dv, + author = {\relax{{CA}/Browser Forum}}, + title = {Ballot SC27v3: Version 3 Onion Certificates}, + howpublished = {\url{https://cabforum.org/2020/02/20/ballot-sc27v3-version-3-onion-certificates/}, accessed 2022-08-01}, +} + +@inproceedings{chuat-gossip, + author = {Laurent Chuat and Pawel Szalachowski and Adrian Perrig and Ben Laurie and Eran Messeri}, + title = {Efficient gossip protocols for verifying the consistency of Certificate logs}, + booktitle = {{CNS}}, + year = 
{2015}, +} + +@inproceedings{lwm, + author = {Rasmus Dahlberg and Tobias Pulls}, + title = {Verifiable Light-Weight Monitoring for {Certificate Transparency} Logs}, + booktitle = {NordSec}, + year = {2018}, +} + +@inproceedings{smt, + author = {Rasmus Dahlberg and Tobias Pulls and Roel Peeters}, + title = {Efficient Sparse {Merkle} Trees - Caching Strategies and Secure (Non-)Membership Proofs}, + booktitle = {NordSec}, + year = {2016}, +} + +@article{ctor-popets, + author = {Rasmus Dahlberg and Tobias Pulls and Tom Ritter and Paul Syverson}, + title = {Privacy-Preserving \& Incrementally-Deployable Support for {Certificate Transparency} in {Tor}}, + journal = {PETS}, + volume = {2021}, + number = {2}, +} + +@misc{digicert-onion, + author = {\relax{DigiCert Inc.}}, + title = {Ordering a .onion certificate from {DigiCert}}, + howpublished = {\url{https://www.digicert.com/blog/ordering-a-onion-certificate-from-digicert}, accessed 2022-08-01}, +} + +@inproceedings{tor-design, + title = {Tor: The Second-Generation Onion Router}, + author = {Roger Dingledine and Nick Mathewson and Paul Syverson}, + booktitle = {USENIX Security}, + year = {2004}, +} + +@inproceedings{ct-foundation, + author = {Benjamin Dowling and Felix G{\"{u}}nther and Udyani Herath and Douglas Stebila}, + title = {Secure Logging Schemes and {Certificate Transparency}}, + booktitle = {{ESORICS}}, + year = {2016}, +} + +@misc{vds, + author = {Adam Eijdenberg and Ben Laurie and Al Cutter}, + title = {Verifiable Data Structures}, + howpublished = {\url{https://github.com/google/trillian/blob/111e9369ab032e493a2f19f9be6d16c4f78ccca5/docs/papers/VerifiableDataStructures.pdf}, accessed 2022-08-01}, +} + +@misc{certbot, + key = {EFF}, + title = {Changing a Certificate's Domain}, + howpublished = {\url{https://eff-certbot.readthedocs.io/en/stable/using.html\#changing-a-certificate-s-domains}, accessed 2022-08-01}, +} + +@misc{fink, + author = {Alex Fink}, + title = {Mnemonic .onion {URLs}}, + howpublished = 
{\url{https://gitweb.torproject.org/torspec.git/tree/proposals/194-mnemonic-urls.txt}, accessed 2022-08-01}, +} + +@techreport{dangerous-labels, + author = {Daniel Kahn Gillmor}, + title = {{Dangerous Labels in DNS and E-mail}}, + type = {Internet-Draft}, + number = {draft-dkg-intarea-dangerous-labels-01}, + institution = {IETF}, + year = {2022}, +} + +@misc{chrome-logs, + title = {Known Logs}, + key = {Google LLC.}, + howpublished = {\url{https://github.com/google/certificate-transparency-community-site/blob/master/docs/google/known-logs.md}, accessed 2022-08-01}, +} + +@misc{harica-onion, + author = {\relax{Harica}}, + title = {{DV} certificates for Onion websites}, + howpublished = {\url{https://news.harica.gr/article/onion_announcement/}, accessed 2022-08-01}, +} + +@misc{kadianakis, + author = {George Kadianakis and Yawning Angel and David Goulet}, + title = {A Name System {API} for {Tor} Onion Services}, + howpublished = {\url{https://gitweb.torproject.org/torspec.git/tree/proposals/279-naming-layer-api.txt}, accessed 2022-08-01}, +} + +@article{ct/a, + author = {Ben Laurie}, + title = {Certificate transparency}, + journal = {CACM}, + volume = {57}, + number = {10}, + year = {2014}, +} + +@misc{trans-laurie, + title = {{Re: [Trans] Mozilla's} basic take on Binary Transparency}, + author = {Ben Laurie}, + howpublished = {\url{https://mailarchive.ietf.org/arch/msg/trans/1FxzTkn4LVxU6KN2P3YfbVsKpho/}, accessed 2022-08-01}, +} + +@techreport{ct-rfc, + author = {Ben Laurie and Adam Langley and Emilia Kasper}, + title = {{Certificate Transparency}}, + type = {RFC}, + number = {6962}, + institution = {IETF}, + year = {2013}, +} + +@inproceedings{li, + author = {Bingyu Li and Jingqiang Lin and Fengjun Li and Qiongxiao Wang and Qi Li and Jiwu Jing and Congli Wang}, + title = {{Certificate Transparency} in the Wild: Exploring the Reliability of Monitors}, + booktitle = {{CCS}}, + year = {2019}, +} + +@inproceedings {coniks, + author = {Marcela S. 
Melara and Aaron Blankstein and Joseph Bonneau and Edward W. Felten and Michael J. Freedman}, + title = {{CONIKS}: Bringing Key Transparency to End Users}, + booktitle = {USENIX Security}, + year = {2015}, +} + +@misc{russia-blocks, + author = {Simon Migliano and Samuel Woodhams}, + title = {Websites Blocked in {Russia} Since {Ukraine} Invasion}, + howpublished = {\url{https://www.top10vpn.com/research/websites-blocked-in-russia/}, accessed 2022-08-01}, +} + +@misc{vanity-address, + title = {mkp224o---vanity address generator for ed25519 onion services}, + howpublished = {\url{https://github.com/cathugger/mkp224o}, accessed 2022-08-01}, +} + +@misc{mozilla-bt, + author = {Mozilla}, + title = {Security/Binary Transparency}, + howpublished = {\url{https://wiki.mozilla.org/Security/Binary_Transparency}, accessed 2022-08-01}, +} + +@misc{muffet-onions, + author = {Alec Muffett}, + title = {Onion {Certificate Transparency} Log}, + howpublished = {\url{https://github.com/alecmuffett/real-world-onion-sites}, accessed 2022-08-01}, +} + +@misc{sooc, + author = {Alec Muffett}, + title = {Same Origin Onion Certificates}, + howpublished = {\url{https://crt.sh/?id=6819596552}, accessed 2022-08-01}, +} + +@misc{namecoin, + title = {Namecoin}, + howpublished = {\url{https://www.namecoin.org/}, accessed 2022-08-01}, +} + +@misc{nordberg-tor, + author = {Linus Nordberg}, + title = {{Tor} Consensus Transparency}, + howpublished = {\url{https://gitweb.torproject.org/torspec.git/tree/proposals/267-tor-consensus-transparency.txt}, accessed 2022-08-01}, +} + +@phdthesis{nurmi, + author = {Nurmi, Juha}, + title = {Understanding the Usage of Anonymous Onion Services}, + year = {2019}, + school = {Tampere University, Finland}, +} + +@Misc{haroi-tor-dev, + author = {nusenu}, + title = {{HAROI}: Human Readable Authenticated Relay Operator Identifier}, + howpublished = {\url{https://lists.torproject.org/pipermail/tor-dev/2021-December/014688.html}, accessed 2022-08-01}, +} + 
+@misc{onion-location, + author = {Tor Project}, + title = {Onion-Location}, + howpublished = {\url{https://community.torproject.org/onion-services/advanced/onion-location/}, accessed 2022-08-01}, +} + +@misc{onion-service-overview, + author = {Tor Project}, + title = {Onion Services}, + howpublished = {\url{https://community.torproject.org/onion-services/}, accessed 2022-08-01}, +} + +@misc{rhatto, + author = {Silvio Rhatto}, + title = {Sauteed Week {API} Backend}, + howpublished = {\url{https://gitlab.torproject.org/rhatto/sauteed-week/-/blob/main/docs/api.md}, accessed 2022-08-01}, +} + +@misc{sauteed-onion-cert, + title = {Sauteed Onion Certificate}, + howpublished = {\url{https://crt.sh/?id=5957691193}, accessed 2022-08-01}, +} + +@inproceedings{onion-dns, + author = {Nolen Scaife and Henry Carter and Patrick Traynor}, + title = {{OnionDNS}: A seizure-resistant top-level Domain}, + booktitle = {{CNS}}, + year = {2015}, +} + +@article{ct-in-chrome, + author = {Emily Stark and Joe DeBlasio and Devon O'Brien and Davide Balzarotti and William Enck and Samuel King and Angelos Stavrou}, + title = {{Certificate Transparency} in {Google Chrome}: Past, Present, and Future}, + journal = {{IEEE} Secur. Priv.}, + volume = {19}, + number = {6}, + year = {2021}, +} + +@inproceedings{does-ct-break-the-web, + author = {Emily Stark and Ryan Sleevi and Rijad Muminovic and Devon O'Brien and Eran Messeri and Adrienne Porter Felt and Brendan McMillion and Parisa Tabriz}, + title = {Does {Certificate Transparency} Break the Web? 
{Measuring} Adoption and Error Rate}, + booktitle = {{IEEE S\&P}}, + year = {2019}, +} + +@inproceedings{once-and-future, + author = {Paul Syverson}, + title = {The Once and Future Onion}, + booktitle = {ESORICS}, + year = {2017}, +} + +@inproceedings{onion-discovery-attacks, + author = {Paul Syverson and Matt Finkel and Saba Eskandarian and Dan Boneh}, + title = {Attacks on Onion Discovery and Remedies via Self-Authenticating Traditional Addresses}, + booktitle = {WPES}, + year = {2021}, +} + +@inproceedings{secdev19, + author = {Paul Syverson and Matt Traudt}, + title = {Self-Authenticating Traditional Domain Names}, + booktitle = {{SecDev}}, + year = {2019}, +} + +@misc{plex, + title = {How {Plex} is doing {HTTPS} for all its users}, + author = {Filippo Valsorda}, + howpublished = {\url{https://words.filippo.io/how-plex-is-doing-https-for-all-its-users/}, accessed 2022-08-01}, +} + +@Misc{h-e-securedrop, + author = {SecureDrop}, + title = {Getting an Onion Name for Your {SecureDrop}}, + howpublished = {\url{https://securedrop.org/faq/getting-onion-name-your-securedrop/}, accessed 2022-08-01}, +} + +@article{onio-ns, + author = {Jesse Victors and Ming Li and Xinwen Fu}, + title = {The Onion Name System}, + journal = {PETS}, + volume = {2017}, + number = {1}, +} + +@inproceedings{winter, + author = {Philipp Winter and Anne Edmundson and Laura M. 
Roberts and Agnieszka Dutkowska{-}Zuk and Marshini Chetty and Nick Feamster}, + title = {How Do {Tor} Users Interact With Onion Services?}, + booktitle = {{USENIX} Security}, + year = {2018}, +} + +@techreport{nordberg-gossip, + author = {Linus Nordberg and Daniel Kahn Gillmor and Tom Ritter}, + title = {Gossiping in {CT}}, + type = {Internet-draft}, + number = {draft-ietf-trans-gossip-05}, + institution = {IETF}, + year = {2018}, +} diff --git a/summary/src/sauteed/src/related.tex b/summary/src/sauteed/src/related.tex new file mode 100644 index 0000000..d772d0d --- /dev/null +++ b/summary/src/sauteed/src/related.tex @@ -0,0 +1,62 @@ +\section{Related Work} \label{sauteed:sec:related} +The CA/B forum accepts certificates with \texttt{.onion} +addresses~\cite{cab-ballot144,cab-onion-dv}. DigiCert supports extended +validation of \texttt{.onion} addresses~\cite{digicert-onion}, and HARICA domain +validation~\cite{harica-onion}. Muffett proposed same-origin onion certificates +that permit clients to omit verification of the CA trust chain for +onionsites~\cite{sooc}. Sauteed onions help Tor users \emph{discover} +domain names with associated onion addresses. Therefore, it is a +complement to approaches that bring HTTPS to onionsites. + +Syverson suggested that traditional domain names and \texttt{.onion} addresses +can be glued into a single registered domain~\cite{once-and-future}. Nusenu +proposed long-term Tor relay identifiers based on domain names to retrieve lists +of relevant public keys via HTTPS~\cite{haroi-tor-dev}. Sauteed onions may be +used for such associations with the benefit of transparency, and it is further a +\emph{lighter} version of Syverson and Traudt's self-authenticated traditional +addresses which favors early deployment over properties like bidirectional onion +associations, guaranteed timeliness of revocation, and addressing all known +threats~\cite{onion-discovery-attacks,secdev19}. 
+ +Winter~\emph{et al.} studied how users engage with onion services~\cite{winter}. +A gist is that Tor users have a hard time discovering onion addresses and +verifying them as authentic. Common discovery mechanisms that are associated +with human-meaningful identifiers include + personal communication, + webpage links, + onion-location redirects~\cite{onion-location}, + third-party lists~\cite{onion-service-overview}, and + search engines like DuckDuckGo. +Prior work has also focused on enumerating onion addresses without any +associated identity, e.g., through CT-logged certificates with \texttt{.onion} +addresses~\cite{muffet-onions} and crawling~\cite{ahmia.fi,nurmi}. +Sauteed onions enhance onion location by making the claimed associations +transparent in CT, and facilitate third-party solutions with less blind trust +and without assumptions about TLS sites not becoming blocked in the future. + +Several ideas were proposed that mitigate or bypass the problem of +random-looking onion addresses. Some sites generate vanity addresses that, +e.g., start with a prefix and have other memorable traits~\cite{vanity-address}. +Fink sketched out how to map onion addresses to a set of words~\cite{fink}. +Kadianakis~\emph{et al.} defined a common API to hook into alternative +naming systems that give onion addresses pet names~\cite{kadianakis}. +SecureDrop Onion Names is one such example that is, however, +implemented directly +in Tor Browser as an HTTPS Everywhere ruleset for selected news sites. Other +alternative naming systems include Namecoin~\cite{namecoin} and +OnioNS~\cite{onio-ns}. Sauteed onions is also an alternative naming system, but +one that relies on CAs and CT logs. +It may be possible to construct sauteed onions via DNSSEC, but then relying on +the DNS hierarchy without transparency logging. +Scaife~\emph{et~al.}~\cite{onion-dns} proposed the \texttt{.o} TLD as an +onionsite with DNSSEC. 
+ +Nordberg connected transparency logs and the consensus mechanism that Tor +uses~\cite{nordberg-tor}. Dahlberg~\emph{et~al.} proposed CT in Tor for all +certificate validations~\cite{ctor-popets}. We only check signatures of +embedded SCTs in relation to onion location, and our search engine is +a simple application of CT monitoring. +There is a large body of orthogonal work that improves CAs and CT. For example, +multi-path domain-validation makes it harder to hijack onion +associations~\cite{le-multi-path}, and deployment of gossip would harden our +CT log assumptions~\cite{chuat-gossip,nordberg-gossip}. diff --git a/summary/src/sauteed/src/sauteed.tex b/summary/src/sauteed/src/sauteed.tex new file mode 100644 index 0000000..06a581c --- /dev/null +++ b/summary/src/sauteed/src/sauteed.tex @@ -0,0 +1,260 @@ +\section{Saut\'{e} Onions Until Discovery is Transparent and Confection is Firm} \label{sauteed:sec:trans} + +\subsection{System Goals} \label{sauteed:sec:system-goals} +Let an onion association be unidirectional from a traditional domain name to an +onion address. Three main system goals are as follows: + +\begin{description} + \item[Privacy-Preserving Onion Associations] Users should discover the same + onion associations, and otherwise the possibility of an + inconsistency must become public knowledge. + \item[Forward Censorship Resistance] Unavailability of a TLS + site must not impede discovery of past onion associations. + \item[Automated Verifiable Discovery] Onion association search should be + possible without requiring blind trust in third-parties. It must be hard to + fabricate non-empty answers, and easy to automate the setup for scalability + and robustness. +\end{description} + +For comparison, today's onion location~\cite{onion-location} does not assure a +user that the same HTTP header is set for them as for everyone else. 
Classes of +users that connect to a domain at different times or via different +links can be given targeted redirects to distinct onion addresses +without detection~\cite{onion-discovery-attacks}. Onion location also +does not work if a regular site becomes unavailable due to censorship. +The \emph{search engine approach} is further a frequent ask by Tor +users~\cite{winter}. The solutions that exist in practice rely on +manually curated +lists~\cite{muffet-onions,onion-service-overview,h-e-securedrop}, notably with +little or no retroactive accountability. As specified above, we aim for a +similar utility but with a setup that can be automated for all onion +associations and without the ability to easily fabricate non-empty answers +without trivial detection. We sketch out how these security properties are +achieved in Section~\ref{sauteed:sec:security-sketch}. + +\subsection{Threat Model and Scope} \label{sauteed:sec:threat-model} +We consider an attacker that wants to trick a user into visiting a targeted +onionsite without anyone noticing the possibility of such behavior. Users are +assumed to know the right traditional domain name that is easy to remember (such +as \texttt{torproject.org}), but not its corresponding onion address. We +further assume that the attacker either controls a trusted CA sufficiently to +issue certificates or is able to deceive them sufficiently during certificate +issuance to obtain a valid certificate +from that CA\@. Any misbehavior is however assumed to be detectable in CT. So, +the certificate ecosystem is treated as a \emph{building block} that we make no +attempt to improve. + +We permit the attacker to make TLS sites unavailable after setup, but +we assume it is difficult to censor the CT log ecosystem because it can +be mirrored by anyone. 
Also, as part of the Internet authentication +infrastructure, adversaries may have equities conflicts in blocking CT logs, +and if concerned at all about appearance would have a +harder time justifying such a block versus, e.g., a political, +journalism, or social media site. +Similar to CT, we do not attempt to solve certificate revocation and +especially not in relation to certificates that are connected to +discovery of onion associations. This is consistent with Tor Browser's existing +model for revocation with onion location, which similarly depends on the +certificate for the redirecting domain. There is no formal counterpart to revoke +a result in a search engine, but we outline future work related to this. + +Our threat model includes countries that block direct access to HTTPS +sites~\cite{russia-blocks}. +This is arguably a capable attacker, as no country is currently known to +completely block indirect access via the Tor network (though in some places +Tor bridges and/or obfuscated transport is needed). Our threat model also +considers the plethora of blindly trusted parties that help users discover onion +addresses with little or no retroactive +accountability~\cite{ahmia.fi,muffet-onions,onion-service-overview,h-e-securedrop}. +In other words, it is in-scope to pave the path towards more accountability. + +\subsection{Description of Sauteed Onions} \label{sauteed:sec:sauteed-onions} +An observation that inspired work on sauteed onions is that onion +location requires HTTPS~\cite{onion-location}. This means that +discovery of onion associations \emph{already} relies on the CA ecosystem. By +incorporating the use of CT, it is possible to add accountability to CAs and +other parties that help with onion address discovery while also raising the bar +for censoring sites and reducing anonymity. The name sauteed onions is a cooking pun; +the association of an onion address with a domain name becomes transparent for +everyone to see in CT logs. 
+ +For background, a CA-issued certificate can contain both a traditional domain +name and a \texttt{.onion} address~\cite{cab-ballot144,cab-onion-dv}. This can +be viewed as a mutual association because the issuing CA must verify the +traditional domain name \emph{and} the specified onion address. An immediate +problem is that this would be ambiguous if there are multiple domain names; +which one (if any) should be associated with an onion address with such +certificate coalescence? A more appropriate path forward would therefore be to +define an X.509v3 extension for sauteed onions which clearly \emph{declares that +a domain-validated name wants to be associated with an onion address}. + +We describe two uses of sauteed onions that achieve our goals; first assuming it +is easy to get CA-issued certificates that contain associated onion addresses +for domain-validated names, and then a short-term roll-out approach that +could make it a reality now. A sauteed onion is simply a CT-logged certificate +that claims \texttt{example.com} wants to be associated with +\texttt{.onion} but not necessarily the other way around, i.e., a +unidirectional association. + +\subsubsection{Onion Location} \label{sauteed:sec:onion-location} +Figure~\ref{sauteed:fig:onion-location} illustrates onion location that uses +certificates. A user establishes a TLS connection to a site as usual. Upon +encountering a certificate that is CT-logged with an associated onion address +for the visited site \texttt{example.com}, an onion-location prompt becomes +available in Tor Browser or the onion site is visited automatically. This is the same type +of redirect behavior as today's onion location~\cite{onion-location}, except +that the possibility of such a redirect is disclosed in public CT logs. +Attempts at targeted redirects would thus be visible to site owners and +independent third-parties. A redirect to someone else's onion address would +also be visible to the respective site owners. 
Notably the ability to detect +inappropriate redirects acts as a deterrence while also being the first step +towards remediation, e.g., if users bookmarked onion addresses~\cite{winter} +to achieve trust on first use or to avoid visiting a regular site \emph{and} an +onionsite in a way that might reduce a user's anonymity set. + +\begin{figure}[!t] + \centering + \includegraphics[width=.6\columnwidth]{src/sauteed/img/onion-location} + \caption{Onion location based on a CT-logged certificate.} + \label{sauteed:fig:onion-location} +\end{figure} + +A key observation is that onion location has always been a feature +facilitated by TLS. By implementing it in certificates rather than HTTP +headers that are delivered via HTTPS connections, TLS applications that are ``not +web'' can use it too without rolling their own mechanisms. The addition of +requiring CT to follow onion-location redirects is also an improvement compared +to today, although one that could be achieved with an HTTP-based approach as +well (or more ambitiously, for all Tor Browser certificate +validations~\cite{ctor-popets}). + +We prototyped the above in a web extension that is free and open +source~\cite{sauteed-onion-artifacts}. The criterion for CT logging is at least +one embedded SCT from a log in the policy used by Google +Chrome~\cite{chrome-logs}. If an onion-location redirect is followed, the +path of the current webpage is preserved, similar to a typical configuration of +today's HTTP-based onion location header that instead lists a complete +URL~\cite{onion-location}. + +\subsubsection{Search Engine} \label{sauteed:sec:search-engine} +A significant challenge for third-parties that help users discover TLS sites +that are available as onion services is to gain confidence in the underlying +dataset at scale. 
For example, SecureDrop onion names are scoped to news +sites~\cite{h-e-securedrop}; the list by Muffett is scoped as ``no sites for tech +with less than (arbitrary) 10,000 users''~\cite{muffet-onions}; and +\texttt{ahmia.fi} does not even attempt to give onion addresses human-meaningful +names~\cite{nurmi}. To make matters worse, solutions based on manually curated +lists and third-party search are currently implemented with little or no +accountability. + +Figure~\ref{sauteed:fig:search-engine} shows what our approach brings to the table. +All CT logs can be monitored by a third-party to discover sauteed onions. +A search API can then be presented to users for the resulting dataset, similar +to existing monitoring services but scoped specifically for discovery of onion +associations. The utility of such a search API is: +``\emph{what onion addresses are available for \texttt{www.example.com}}''. + +\begin{figure}[!t] + \centering + \includegraphics[width=.6\columnwidth]{src/sauteed/img/onion-search} + \caption{Verifiable domain name to onion address search.} + \label{sauteed:fig:search-engine} +\end{figure} + +The expected behavior of the search API is that an answer can not be fabricated +without controlling a CA or hijacking certificate issuance, and any CA +malfeasance should further be caught by CT\@. This +means that no party can fabricate inappropriate answers without detection. +This is a major improvement compared to the alternative of no verifiability at +all, although one that in and of itself does not prevent \emph{false negatives}. +In other words, available answers could trivially be omitted. This is a +limitation with the authenticated data structure in CT that can be fixed; see +security sketch in Section~\ref{sauteed:sec:security-sketch} for an intuition of how to +work around it. 
+ +We specified an HTTP REST API that facilitates search using a domain name; the +API also makes available additional information like the actual certificate and +its exact index in a CT log. In total there are two endpoints: \texttt{search} +(list of matches with identifiers to more info) and \texttt{get} (more info). The +complete API specification is available online together with our implementation, +which is free and open source~\cite{sauteed-onion-artifacts}. An independent +implementation from Tor's hack week is also available by Rhatto~\cite{rhatto}. +Our prototype runs against all CT logs in Google Chrome for certificates +logged after July 16, 2022. A few query examples are available in +Appendix~\ref{sauteed:app:search}. + +\subsubsection{Certificate Format} \label{sauteed:sec:cert-format} +Until now we assumed that a sauteed onion is easily set up, e.g., using an +X.509v3 extension. The bad news is that such an extension does not exist, and +it would likely be a long journey to standardize and see deployment by CAs. +Therefore, our prototypes rely on a backwards-compatible approach that encodes +onion addresses as subdomains~\cite{once-and-future}. To declare that +\texttt{example.com} wants to be associated with \texttt{.onion}, one can +request a domain-validated certificate that contains both \texttt{example.com} +and \texttt{onion.example.com}~\cite{secdev19}. The inclusion of +\texttt{example.com} ensures that such a setup does not result in a dangerous +label~\cite{dangerous-labels}. The \emph{hack to encode an onion address as a +subdomain} makes it part of the certificate without requiring changes to CAs. +Appendix~\ref{sauteed:app:setup} details the necessary setup-steps further. The gist +is the addition of a subdomain DNS record and using the \texttt{-d} option in +\texttt{certbot}~\cite{certbot}. + +Although the subdomain approach is easy to deploy right now, it is by +no means a perfect solution. 
An X.509v3 extension would not require +the configuration of an +additional DNS record. In other words, the unidirectional sauteed onions +property works just as well if the subdomain is not domain-validated. The +important part is that the CA validates \texttt{example.com}, and that the +associated onion address can be declared somewhere in the issued certificate +without an ambiguous intent. +Another imperfection that goes hand-in-hand with backwards-compatibility is that +CAs would have to \emph{opt-out} from sauteed onions, unlike site owners +that instead have to \emph{opt-in}. + +To avoid recommending a pattern that is discouraged by CAs, the Tor Project +should at least have a dialog with Let's Encrypt which issues the most +certificates~\cite{le}. Somewhat similar subdomain hacks related to CAs exist, +but then with explicit negotiations~\cite{plex}. +Subdomain hacks without a relation to CAs and TLS were discouraged in the +past~\cite{trans-laurie}. We argue that sauteed onions is related because +CA-validated names are at the heart of our approach. For example, this is +unlike Mozilla's binary transparency idea that just wanted to reuse a public +log~\cite{mozilla-bt}. Sauteed onions also do not result in more issued +certificates; it is just the number of domain-validated names that increase by +one for TLS sites that do the setup. + +\subsubsection{Security Sketch} \label{sauteed:sec:security-sketch} +Our threat model disallows the attacker to tamper with CT and to make the log +ecosystem unavailable. Onion location as described in +Section~\ref{sauteed:sec:onion-location} therefore ensures that a redirect becomes +public, achieving detectability as defined in our privacy-preserving onion +association goal. The search engine in Section~\ref{sauteed:sec:search-engine} +trivially achieves the same goal because onion associations are \emph{found} +via CT. 
Blocking a TLS site is additionally \emph{too late} if an association +is already in a CT log, thus achieving forward censorship resistance. +Our search engine approach further makes it hard to forge non-empty answers without +detection because it requires control of a CA and defeating the tamper-evidence +of CT logs. While it is possible to omit available answers, this can be +mitigated by having multiple search APIs, domains that check the integrity of +their own onion associations similar to the proposed verification pattern in +CONIKS~\cite{coniks}, or to represent the sauteed onion dataset as a sparse +Merkle tree to get a verifiable log-backed map that additionally supports +efficient non-membership proofs that CT lacks~\cite{smt,vds}. + +\subsection{Future Work} +It would be valuable to implement proofs of no omissions as well as native +lookups in a web extension or Tor Browser to verify everything before showing +the user a result (certificates, proofs of logging, etc). The entire or +selected parts of the sauteed onion dataset may further be delivered to Tor +Browser similar to SecureDrop onion names~\cite{h-e-securedrop}. The difference +would be that the list is automated using a selection criterion from CT logs +rather than doing it manually on a case-by-case basis. A major benefit is that +the sauteed onion dataset can then be queried locally, completely avoiding +third-party queries and visits to the regular site. Another approach to explore +is potential integration of the sauteed onion dataset into Tor's DHT: a +cryptographic source of truth for available onion associations is likely a +helpful starting point so that there is \emph{something to distribute}. It +would also be interesting to consider other search-engine policies than +\emph{show everything} as in our work, e.g., only first association or last +association. (These policies can be verified with \emph{full +audits}~\cite{vds}.) 
diff --git a/summary/src/tlwo/.gitignore b/summary/src/tlwo/.gitignore new file mode 100644 index 0000000..8bb88c8 --- /dev/null +++ b/summary/src/tlwo/.gitignore @@ -0,0 +1,9 @@ +main.pdf +*.blg +*.bbl +*.fls +*.fdb_latexmk +*.log +*.out +*.aux +*.swp diff --git a/summary/src/tlwo/img/.gitkeep b/summary/src/tlwo/img/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/summary/src/tlwo/img/.gitkeep @@ -0,0 +1 @@ + diff --git a/summary/src/tlwo/img/attack.pdf b/summary/src/tlwo/img/attack.pdf new file mode 100644 index 0000000..c99c22c Binary files /dev/null and b/summary/src/tlwo/img/attack.pdf differ diff --git a/summary/src/tlwo/img/cached.pdf b/summary/src/tlwo/img/cached.pdf new file mode 100644 index 0000000..c0a4524 Binary files /dev/null and b/summary/src/tlwo/img/cached.pdf differ diff --git a/summary/src/tlwo/img/plot_cache_entries-permissive.pdf b/summary/src/tlwo/img/plot_cache_entries-permissive.pdf new file mode 100644 index 0000000..2016a3f Binary files /dev/null and b/summary/src/tlwo/img/plot_cache_entries-permissive.pdf differ diff --git a/summary/src/tlwo/img/plot_cache_entries-web.pdf b/summary/src/tlwo/img/plot_cache_entries-web.pdf new file mode 100644 index 0000000..1373ed0 Binary files /dev/null and b/summary/src/tlwo/img/plot_cache_entries-web.pdf differ diff --git a/summary/src/tlwo/img/plot_cache_hits-permissive.pdf b/summary/src/tlwo/img/plot_cache_hits-permissive.pdf new file mode 100644 index 0000000..6a92fe9 Binary files /dev/null and b/summary/src/tlwo/img/plot_cache_hits-permissive.pdf differ diff --git a/summary/src/tlwo/img/plot_cache_hits-web.pdf b/summary/src/tlwo/img/plot_cache_hits-web.pdf new file mode 100644 index 0000000..f56588b Binary files /dev/null and b/summary/src/tlwo/img/plot_cache_hits-web.pdf differ diff --git a/summary/src/tlwo/img/plot_lookups-permissive.pdf b/summary/src/tlwo/img/plot_lookups-permissive.pdf new file mode 100644 index 0000000..172046d Binary files /dev/null and 
b/summary/src/tlwo/img/plot_lookups-permissive.pdf differ diff --git a/summary/src/tlwo/img/plot_lookups-web.pdf b/summary/src/tlwo/img/plot_lookups-web.pdf new file mode 100644 index 0000000..8936b14 Binary files /dev/null and b/summary/src/tlwo/img/plot_lookups-web.pdf differ diff --git a/summary/src/tlwo/img/plot_popularity_match-permissive.pdf b/summary/src/tlwo/img/plot_popularity_match-permissive.pdf new file mode 100644 index 0000000..ccd2d4c Binary files /dev/null and b/summary/src/tlwo/img/plot_popularity_match-permissive.pdf differ diff --git a/summary/src/tlwo/img/plot_popularity_match-web.pdf b/summary/src/tlwo/img/plot_popularity_match-web.pdf new file mode 100644 index 0000000..fc49a4b Binary files /dev/null and b/summary/src/tlwo/img/plot_popularity_match-web.pdf differ diff --git a/summary/src/tlwo/img/plot_preload_entries-permissive.pdf b/summary/src/tlwo/img/plot_preload_entries-permissive.pdf new file mode 100644 index 0000000..a08e43a Binary files /dev/null and b/summary/src/tlwo/img/plot_preload_entries-permissive.pdf differ diff --git a/summary/src/tlwo/img/plot_preload_entries-web.pdf b/summary/src/tlwo/img/plot_preload_entries-web.pdf new file mode 100644 index 0000000..e3f3ebf Binary files /dev/null and b/summary/src/tlwo/img/plot_preload_entries-web.pdf differ diff --git a/summary/src/tlwo/img/plot_preload_hits-permissive.pdf b/summary/src/tlwo/img/plot_preload_hits-permissive.pdf new file mode 100644 index 0000000..1f6cacc Binary files /dev/null and b/summary/src/tlwo/img/plot_preload_hits-permissive.pdf differ diff --git a/summary/src/tlwo/img/plot_preload_hits-web.pdf b/summary/src/tlwo/img/plot_preload_hits-web.pdf new file mode 100644 index 0000000..ce38004 Binary files /dev/null and b/summary/src/tlwo/img/plot_preload_hits-web.pdf differ diff --git a/summary/src/tlwo/img/plot_preload_lists-permissive.pdf b/summary/src/tlwo/img/plot_preload_lists-permissive.pdf new file mode 100644 index 0000000..9c79a77 Binary files /dev/null and 
b/summary/src/tlwo/img/plot_preload_lists-permissive.pdf differ diff --git a/summary/src/tlwo/img/plot_preload_lists-web.pdf b/summary/src/tlwo/img/plot_preload_lists-web.pdf new file mode 100644 index 0000000..a864f65 Binary files /dev/null and b/summary/src/tlwo/img/plot_preload_lists-web.pdf differ diff --git a/summary/src/tlwo/img/preload.pdf b/summary/src/tlwo/img/preload.pdf new file mode 100644 index 0000000..9f06a14 Binary files /dev/null and b/summary/src/tlwo/img/preload.pdf differ diff --git a/summary/src/tlwo/img/preload.svg b/summary/src/tlwo/img/preload.svg new file mode 100644 index 0000000..e507b66 --- /dev/null +++ b/summary/src/tlwo/img/preload.svg @@ -0,0 +1,1009 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Auxiliary Operation + Offline operation. + + + + Connector + Exit to or entry from another part of chart. + + + + + + + + + + + + + + + + + + + + + + Phase 3 - periodically resolveall domains in allowlist locally + Phase 2 - compile allowlistwith unqiue domain names + + + + + foo.org + cdn.foo.org + ads.foo.org + ... + + + central party + relay + + foo.org <IP> + bar.org <IP> + baz.org <IP> + cdn.foo.org <IP> + ads.foo.org <IP> + ... + ... + + Sharedpreload cache + Per-circuit cacheswithout any sharing + Circuit A + + Circuit B + + Circuit C + + Circuit D + + + + + + + + + + + + + + + + + + DNShierarchy + ... + ... + ... + ... + Store domains foundwhile loading foo.org,bar.org, baz.org, ..., fromseveral vantage points + Phase 1 - visit siteson a popularity list + + + foo.org + cdn.foo.org + ads.foo.org + ... + + + + + + + + + + + + + + + + + foo.org + + + + + + + + + + + + + + + foo.org + + + ... 
+ + + + diff --git a/summary/src/tlwo/img/repeat-attack.pdf b/summary/src/tlwo/img/repeat-attack.pdf new file mode 100644 index 0000000..36e2f73 Binary files /dev/null and b/summary/src/tlwo/img/repeat-attack.pdf differ diff --git a/summary/src/tlwo/img/resolve.pdf b/summary/src/tlwo/img/resolve.pdf new file mode 100644 index 0000000..ff7ab6e Binary files /dev/null and b/summary/src/tlwo/img/resolve.pdf differ diff --git a/summary/src/tlwo/img/setting.pdf b/summary/src/tlwo/img/setting.pdf new file mode 100644 index 0000000..aee9012 Binary files /dev/null and b/summary/src/tlwo/img/setting.pdf differ diff --git a/summary/src/tlwo/img/uncached.pdf b/summary/src/tlwo/img/uncached.pdf new file mode 100644 index 0000000..2a83a17 Binary files /dev/null and b/summary/src/tlwo/img/uncached.pdf differ diff --git a/summary/src/tlwo/main.tex b/summary/src/tlwo/main.tex new file mode 100644 index 0000000..09110c4 --- /dev/null +++ b/summary/src/tlwo/main.tex @@ -0,0 +1,69 @@ +\begin{kaupaper}[ + author={% + \textbf{Rasmus Dahlberg} and + Tobias Pulls + }, + title={% + Timeless Timing Attacks and Preload Defenses in Tor's DNS Cache + }, + reference={% + USENIX Security (2023) + }, + summary={% + Tor relays cache resolved domains with constant time-to-live values not to + reveal information about past exit traffic while boosting performance. We + show that this caching strategy and its implementation in the live Tor + network can be exploited by a \emph{timeless timing attack} that leaks if a + domain is (not) cached. Further, the time that a domain was inserted into + the cache can be inferred by repeated probes. Our attack prototype's + experimental evaluation in real conditions shows that there are neither + false positives nor false negatives (10M~repetitions). Thus, it is useful + for instantiating a real-world website oracle without requiring any special attacker + capabilities or reach (just a modest computer that can create a Tor + circuit). 
One of our mitigations has been merged in Tor: probabilistic + time-to-live values that make the time-of-insertion fuzzy. Long-term, + Tor's DNS cache could be redesigned to \emph{preload} the same domains at all + exits. Such preloading would eliminate all (timeless) timing attacks in + Tor's DNS cache because the same domains would always be (un)cached across + different circuits. To retain performance within the same circuit, we + propose that the preloaded domains should be complemented by a dynamic + same-circuit cache that is not shared across circuits. Our + four-month-long DNS cache measurement at two 100~Mbit/s exit relays + informs on today's baseline performance. It is compared to a preloaded + DNS cache based on different variations of three popularity lists: Alexa, + Tranco, and Umbrella. A preloaded DNS cache can be as performant as today + with similar resource usage or significantly improve cache-hit ratios by + 2-3x. However, the increased cache-hit ratios have the cost of modest + increases in memory and resolver load. + }, + participation={\vspace{-.25cm} + Tobias and I collaborated closely from start to finish with the following + exceptions. I did most implementation work. Volunteers from DFRI---a + Swedish non-profit and non-partisan organization that promotes digital + rights---operated our exit relays. Tobias did most DNS cache data + analysis. Tobias also had the initial idea, which was refined with + feedback~from~Roger~Dingledine. 
+ }, + label={ + paper:tlwo + }, +] + \maketitle + \begin{abstract} + \input{src/tlwo/src/abstract} + \end{abstract} + + \input{src/tlwo/src/introduction} + \input{src/tlwo/src/background} + \input{src/tlwo/src/tor-cache} + \input{src/tlwo/src/attack} + \input{src/tlwo/src/short} + \input{src/tlwo/src/long} + \input{src/tlwo/src/related} + \input{src/tlwo/src/conclusion} + \input{src/tlwo/src/acknowledgements} + \input{src/tlwo/src/availability} + + \bibliographystyle{plain} + \bibliography{src/tlwo/src/ref} +\end{kaupaper} diff --git a/summary/src/tlwo/src/abstract.tex b/summary/src/tlwo/src/abstract.tex new file mode 100644 index 0000000..df4fa1b --- /dev/null +++ b/summary/src/tlwo/src/abstract.tex @@ -0,0 +1,25 @@ +\noindent +We show that Tor's DNS cache is vulnerable to a timeless timing attack, allowing +anyone to determine if a domain is cached or not without any false positives. +The attack requires sending a single TLS record. It can be repeated to determine +when a domain is no longer cached to leak the insertion time. Our evaluation in +the Tor network shows no instances of cached domains being reported as uncached +and vice versa after 12M repetitions while only targeting our own domains. This +shifts DNS in Tor from an unreliable side-channel---using traditional timing +attacks with network jitter---to being perfectly reliable. We responsibly +disclosed the attack and suggested two short-term mitigations. + +As a long-term defense for the DNS cache in Tor against all types of (timeless) +timing attacks, we propose a redesign where only an allowlist of domains is +preloaded to always be cached across circuits. We compare the performance of a +preloaded DNS cache to Tor's current solution towards DNS by measuring +aggregated statistics for four months from two exits (after engaging with the +Tor Research Safety Board and our university ethical review process). 
The +evaluated preload lists are variants of the following top-lists: Alexa, Cisco +Umbrella, and Tranco. Our results show that four-months-old preload lists can be +tuned to offer comparable performance under similar resource usage or to +significantly improve shared cache-hit ratios (2--3x) with a modest increase in +memory usage and resolver load compared to a 100 Mbit/s exit. We conclude that +Tor's current DNS cache is mostly a privacy harm because the majority of cached +domains are unlikely to lead to cache hits but remain there to be probed by +attackers. diff --git a/summary/src/tlwo/src/acknowledgements.tex b/summary/src/tlwo/src/acknowledgements.tex new file mode 100644 index 0000000..84302c0 --- /dev/null +++ b/summary/src/tlwo/src/acknowledgements.tex @@ -0,0 +1,20 @@ +\section*{Acknowledgments} +Many thanks to + Georg Koppen and + Marc Juarez +for engaging with us in continuous Tor Research Safety Board discussions, as +well as + Elias Rudberg, + Johan Nilsson, and + Linus Nordberg +who operated our modified Tor relays at the premises of DFRI. +We would further like to thank our shepherd, the anonymous reviewers, + Mike Perry, + Nick Mathewson, + Paul Syverson, and + Roger Dingledine +for their valuable feedback. +The authors were supported by + Mullvad VPN, + the Swedish Foundation for Strategic Research, and + the Swedish Internet Foundation. diff --git a/summary/src/tlwo/src/attack.tex b/summary/src/tlwo/src/attack.tex new file mode 100644 index 0000000..542cdb3 --- /dev/null +++ b/summary/src/tlwo/src/attack.tex @@ -0,0 +1,247 @@ +\section{Timeless Timing Attack} \label{tlwo:sec:attack} + +Past work demonstrated timing attacks against Tor's DNS cache~\cite{wfwo}. In +short, anyone can observe the latency of a domain lookup to determine if it is +more or less likely that an answer is (not) cached. A quick response is more +likely to be cached, thereby leaking information about past traffic on an exit. 
+A downside of such a remote timing attack is that it is subject to network +jitter while traversing hops in the Tor network. We show how to bypass this +limitation by constructing a timeless timing attack that is immune to network +jitter~\cite{timeless}. Notably the attack only requires Internet access and a +very modest computer. + +Section~\ref{tlwo:sec:attack:detailed} outlines the attack, followed by a description +of our prototype implementation in Section~\ref{tlwo:sec:attack:prototype}, +evaluation in Section~\ref{tlwo:sec:attack:measurements}, as well as ethical +considerations in Section~\ref{tlwo:sec:attack:ethical}. + +\subsection{Detailed Description} \label{tlwo:sec:attack:detailed} +An exit's processing of an incoming RESOLVE cell depends on if an answer is +cached or not, see Figure~\ref{tlwo:fig:resolve}. An answer may already be available +and a RESOLVED cell can be scheduled for sending immediately (``cached''). +Otherwise an answer is not yet available and a resolve process needs to take +place concurrently to avoid blocking (``uncached''). We construct a timeless +timing attack by exploiting the fact that scheduling RESOLVED cells for sending +with different concurrent timings depend on if an answer is cached (send +immediately) or uncached (send based on an event later on)~\cite{ctor-1}. + +\begin{figure}[!t] + \centering + \includegraphics[width=.67\columnwidth]{src/tlwo/img/resolve} + \caption{% + Processing of an incoming RESOLVE cell at an exit relay. + Answers of concurrent resolves are triggered by events. + } + \label{tlwo:fig:resolve} +\end{figure} + +\subsubsection{Attack Outline} + +Suppose that we craft two RESOLVE cells for \texttt{example.com} and +\texttt{evil.com} such that they are processed by an exit \emph{directly after +each other without any events in between}. Further suppose that +\texttt{evil.com} is cached. The first RESOLVE cell is \texttt{example.com}. +The second RESOLVE cell is \texttt{evil.com}. 
Following from the flow in +Figure~\ref{tlwo:fig:resolve}, we can determine if \texttt{example.com} is (un)cached +by observing only the order in which the two RESOLVED cells come back. The +order will be switched if \texttt{example.com} needs concurrent resolving +because \emph{the answer is not available until after an event} (uncached). +Otherwise the order is preserved (cached). Sending two requests to be processed +at the same time and exploiting concurrency as well as differences in processing +time that affects the response order is what makes it +\emph{timeless}~\cite{timeless}. + +Figure~\ref{tlwo:fig:timeless} provides a detailed description on how to satisfy the +presumed setup. The attacker starts by looking up its own domain name for a +selected exit. This ensures that \texttt{evil.com} is cached. Next, two RESOLVE +cells are sent in the same TLS record from a hop preceding the exit. Both cells +will be unpacked at the same time by TLS~\cite{ctor-2}, +% Note: input parameter at_most is set to -1 by tor's main loop +and when processing starts all available cells will be handled before giving +control back to Tor's main loop~\cite{ctor-3}. +Now recall that Tor is single-threaded. An event from any concurrent DNS +resolve can thus not be completed before all unpacked cells were fully +processed. This ensures that the order in which our two RESOLVED cells come +back in is guaranteed to leak if \texttt{example.com} is (un)cached as long as +both RESOLVE cells arrived together in-order and \texttt{evil.com} is really +cached. + +\begin{figure}[!t] + \centering + \subfloat[][uncached]{% + \includegraphics[width=.7\columnwidth]{src/tlwo/img/uncached} + \label{tlwo:fig:timeless:a} + }\\ + \subfloat[][cached]{% + \includegraphics[width=.7\columnwidth]{src/tlwo/img/cached} + \label{tlwo:fig:timeless:b} + } + \caption{% + The attacker ensures a domain \texttt{evil.com} is cached. Next, two + RESOLVE cells are sent to arrive at the same time in-order. 
The relay + processes both cells before triggering any resolve event. This means + that answers can only be sent directly if no resolving is needed. The + order of RESOLVED cells switches if \texttt{example.com} is uncached. + Otherwise the order is preserved. } + \label{tlwo:fig:timeless} +\end{figure} + +It should be noted that an attacker can gain full control of how their TLS +records are packed to exits by either running a modified Tor relay or creating +one-hop circuits. In practice, it is also possible to omit the step of caching +\texttt{evil.com} and instead send a \texttt{RESOLVE} cell containing an IP +address. Tor will simply echo the IP as if it was cached~\cite{ctor-4}. We +describe the attack without this optimization because it is more general. + +\subsubsection{Repeated Attack to Learn Insertion Time} +So far we described how to determine if a domain is (un)cached at an exit. +Figure~\ref{tlwo:fig:attack-repeated} shows how to derive the exact time that a +domain was added to an exit's DNS cache. First determine whether the domain's +TTL will be clipped to 300 or 3,600 seconds by observing the TTL returned from +the authoritative name server or the configured resolvers of the +exit~\cite{GreschbachPRWF17}. Then repeat the timeless timing attack +periodically until the domain is no longer cached, say, once per second. Suppose +the expected clip is 300 seconds and the attack started at time $t$. If it +takes $x < 300$ seconds for the entry to become uncached, it was added to the +exit's DNS cache at time $t+x - 300\textrm{s}$. Observing $x > 300$ seconds +means that a different party inserted the entry into the cache between probes +(may happen for some of the most frequently looked-up domains, depending on +probing frequency). To recover, the attacker can perform the same steps again +until they succeed. For example, with two tries the initial insertion happened +at $t+x - 600\textrm{s}$. 
Notably these estimates cannot be more precise than +the attacker's repetition interval. + +\begin{figure}[!t] + \centering + \includegraphics[width=.53\columnwidth]{src/tlwo/img/repeat-attack} + \caption{% + Repeated timeless timing attack to infer the exact time that a domain + was cached by someone at an exit relay. For example, if the expected + clip is 300s ($\mathsf{ttl}\le300$s), the attack is repeated every + second, and the observed $x$ is 40s, then caching of + \texttt{example.com} happened at time $\approx t-260$s. } + \label{tlwo:fig:attack-repeated} +\end{figure} + +\subsubsection{Discussion} +While an attacker can determine if a domain is cached by an exit and if so the +exact time it was added, the attacker cannot determine the number or timings of +lookups for a domain after entering the cache. In isolation, the attacker also +cannot determine which identifiable user cached a given domain. + +It is easy to conduct the attack in parallel because +probing for the status of \texttt{foo.org} is completely independent from +\texttt{bar.org} at the same relay as well as other probes on different relays. +In other words, an attacker can probe a single domain on all exits +simultaneously, many different domains at a single exit, or both. Network-wide +probes for the same domain may be detectable by observing the DNS caches of +multiple relays and correlating their contents. However, note that a risk-averse +attacker~\cite{AumannL07} may spread their probes over time (five or sixty +minutes) and domains (expected twelve domains per website on Alexa top-1M +websites~\cite{GreschbachPRWF17}), if the goal is to confirm a website visit. + +An example use-case for a parallel attack is answering network-wide queries, for +example, ``is \texttt{foo.org} visited more frequently than \texttt{bar.org}, or +did any Tor user visit \texttt{baz.org} at a particular point in time?'' The +latter is an instantiation of a so-called website oracle~\cite{wfwo}. 
Website +oracles remove virtually all false positives in WF attacks for all but the most +popular websites on the web, and WF attacks may connect identifiable users with +visited websites. See Figure~\ref{tlwo:fig:setting} in +Section~\ref{tlwo:sec:introduction} for an overview of this attack setting. + +\subsection{Prototype Implementation} \label{tlwo:sec:attack:prototype} + +We prototyped our timeless timing attack so that it runs for a given exit and a +list of domains. Figure~\ref{tlwo:fig:attack} shows the overall setup which consists +of \texttt{carml}, +\texttt{tor-resolve}, a locally patched Tor process, and a Python script +automating the entire setup. First Tor is started, a \emph{one-hop circuit} is +built to the selected exit, and all streams are attached to it using +\texttt{carml}. Next, \texttt{tor-resolve} is used to send a special lookup +query for \texttt{example.com} by simply appending a magic string +\texttt{---sc}. The patched Tor process splits such requests into two RESOLVE +cells in the same TLS record: one for the specified domain, and another one that +is guaranteed to not need any concurrent resolving. Finally Tor sets the output +to \texttt{0.0.0.0} if the resulting RESOLVED cells switched order, otherwise +\texttt{1.1.1.1} (arbitrary constants). +After processing all domains Tor is closed and the output +is a list where each item is zero (uncached), one (cached), or negative +(unknown, e.g., due to a resolve timeout, a stream attach failure, or a vanished +circuit). +The complete attack required less than 100 lines of C to patch Tor, as well as +200 lines of Python to make it fully automated. + +\begin{figure}[!t] + \centering + \includegraphics[width=.53\columnwidth]{src/tlwo/img/attack} + \caption{% + Local attack setup consisting of \texttt{carml} to build one-hop + circuits, \texttt{tor-resolve} to inject queries, and a patched + tor process that transforms them into timeless timing attacks. 
+ } + \label{tlwo:fig:attack} +\end{figure} + +\subsection{Network Measurements} \label{tlwo:sec:attack:measurements} + +We conducted measurements in the live Tor network to evaluate the reliability of +our prototype with four parallel instances of +the setup in Figure~\ref{tlwo:fig:attack} on a system with an Intel(R) Xeon(R) CPU +E5-2630 @ 2.30GHz and 4GB of DRAM. All targeted domains were our own, see +ethical considerations in Section~\ref{tlwo:sec:attack:ethical}. In total there were +$14,446$ runs between May 17--26, 2022. Each run used an exit that was sampled +uniformly at random. Assuming $1,000$ exits at all times (conservative), the +individual select probability should not exceed $0.004$ per run. Each run +performed up to $1,000$ timeless timing attacks, chunked into $500$ attacks per +circuit and alternating between uncached and cached lookups by specifying a +unique domain twice in a row: \texttt{...example.com}. The maximum runtime was set to ten minutes. Each query also +had a ten second timeout. In the event of errors like circuit failure or +timeouts, the remainder of the run was canceled but all results up until that +point were collected. The average number of DNS requests leaving the Tor +network from \emph{all four combined instances} was $8.6$ per second. The +effective queries per second was slightly higher due to brief pauses while +setting up a new run. For reference, Sonntag reported in 2018 that the DNS +resolver of an exit with $200$Mbit/s received an average and maximum of $18$ and +$81$ requests per second~\cite{sonntag-metrics}. Earlier, +Figure~\ref{tlwo:fig:lookups} also showed significant variability in lookups. +Handling our per-exit overhead during a couple of minutes should thus be +insignificant when compared to regular patterns for DNS traffic in the network. + +Table~\ref{tlwo:tab:attack} summarizes our results. 
After 12M timeless timing +attacks, there were no cases of uncached lookups being reported as cached and +vice versa. This is consistent with the description in +Section~\ref{tlwo:sec:attack:detailed}: neither false positives nor false negatives +are expected. The observed probability of not getting an answer due to +detectable failures was $0.00025$. + +\begin{table}[!t] + \centering + \caption{% + Timeless timing attack results. Neither false negatives nor + false positives were observed with 6M repetitions each. + } + \begin{tabular}{c|ccc} + Type & Got uncached & Got cached & Failures \\ + \hline + Uncached & $6,034,779$ & $0$ & $2,858$ \\ + Cached & $0$ & $6,034,594$ & $142$ \\ + \end{tabular} + \label{tlwo:tab:attack} +\end{table} + +\subsection{Ethical Considerations} \label{tlwo:sec:attack:ethical} + +We responsibly disclosed our attack to the Tor Project through their security +list. The submitted disclosure included a theoretical attack description, a +prototype implementation with measurements showing how reliable it was, as well +as a sketch of short-term and long-term defenses. As part of our dialog, we also +coordinated with the Tor Project on submitting this paper to USENIX Security to +get peer review. + +The conducted network measurements targeted domains under our own control. This +ensured that we did not learn anything about real Tor users. Performance +overhead on exits and the Tor network at large was also modest, see +Section~\ref{tlwo:sec:attack:measurements}. In other words, the downsides were +negligible while the significance of evaluating \emph{real-world reliability} +was helpful to inform and motivate the need for mitigations and defenses. 
diff --git a/summary/src/tlwo/src/availability.tex b/summary/src/tlwo/src/availability.tex new file mode 100644 index 0000000..cff4dbc --- /dev/null +++ b/summary/src/tlwo/src/availability.tex @@ -0,0 +1,19 @@ +\section*{Availability} \label{tlwo:sec:availability} +We make the following three artifacts available: + +\begin{enumerate} + \item Patches to Tor, associated scripts and data, and documentation for + performing timeless timing attacks. + \item The measurement data from our two exits, a detailed timeline of + operations, scripts for creating extended preload lists, and associated + Python scripts for parsing all stats and generating figures. + Sharing of the dataset was discussed as part of the contact with the Tor + Research Safety Board and our university ethical review process. Relevant + parts of our research safety board contact are included in our artifact. + \item Contributions to the Tor Project, including source code and associated + tooling for our Fuzzy TTLs mitigation and preload defense. +\end{enumerate} + +See +\url{https://gitlab.torproject.org/rgdd/ttapd} +to locate the above. diff --git a/summary/src/tlwo/src/background.tex b/summary/src/tlwo/src/background.tex new file mode 100644 index 0000000..719dec6 --- /dev/null +++ b/summary/src/tlwo/src/background.tex @@ -0,0 +1,73 @@ +\section{Background} \label{tlwo:sec:background} +The remainder of the paper requires preliminaries about DNS +(Section~\ref{tlwo:sec:background:dns}), in particular in relation to Tor +(Section~\ref{tlwo:sec:background:tor}). + +\subsection{DNS} \label{tlwo:sec:background:dns} DNS is a hierarchical system that +maps domain names (``domains'') to IP addresses. The hierarchy is composed of +root servers, top-level domain (TLD) servers, and authoritative name servers. +Root servers are aware of TLD servers like \texttt{.com}. TLD servers are aware +of authoritative name servers in their zone like \texttt{example.com}. 
+Authoritative name servers are aware of the actual answers to a domain lookup. +A domain lookup for \texttt{example.com} involves asking the root server for the +TLD server of \texttt{.com}; the TLD server for the authoritative name server of +\texttt{example.com}; and finally the authoritative name server for the IP +address of \texttt{example.com}. The resolve process is typically performed +iteratively in plaintext over UDP by a third-party resolver that caches +responses, e.g., to improve performance. The default is usually to rely on ISP +DNS resolvers. It is also possible to configure other ones, e.g., Google's +\texttt{8.8.8.8} or self-hosted using \texttt{unbound}, \texttt{bind}, etc. + +Of note is that the resolved domains are associated with a Time To Live (TTL) +value. As the name suggests, it is the amount of time that a resolved domain +should be considered fresh. TTL values are sometimes overridden in caches to +improve reliability~\cite{rfc8767,MouraHMSD18} or preserve +privacy~\cite{GreschbachPRWF17}. + +\subsection{Tor} \label{tlwo:sec:background:tor} +The Tor network is composed of thousands of relays that route encrypted traffic +on behalf of millions of daily users~\cite{tor,ManiWJJS18}. Ordinary uses of +Tor include preserving privacy, safety and freedom as well as facilitating +dissent and circumventing censorship~\cite{tpo-russia,tpo-who-uses-tor}. Access +to the Tor network is easy using Tor Browser (TB), which is configured to proxy +all traffic through a local Tor process that takes care of routing. TB adds +many other protections that are orthogonal to our work~\cite{tb}. + +During a regular website visit a circuit is built through a guard, middle, and +exit relay. The first relay is fixed in a small guard set that rarely changes +once selected, while the middle and exit relays are randomly selected weighted +by bandwidth for each new circuit. 
A circuit may have many streams (analogous +to TCP/IP connections), typically corresponding to separate flows for a +particular destination. Control traffic and data is transported through the +network in fixed-size cells that are encrypted in layers. At each hop in a +circuit, one layer of encryption is peeled-off. Outstanding cells from relay A +to relay B are sent in a shared channel that is TLS protected. Public keys, +relay identities, and more are discovered in Tor's consensus, which is secure if +a threshold of trusted directory authorities act honestly. + +We are particularly interested in how Tor interacts with DNS. To look up a +domain, the user's Tor process may send a RESOLVE cell that requests resolution +by the exit. Some exits are configured with their own iterative resolvers, +while others rely on DNS from their ISP or other +third-parties~\cite{GreschbachPRWF17}. The answer to a lookup is stored in the +exit's cache, but with the TTL \emph{clipped} to 300 or 3600 seconds depending +on if the TTL is $\le 300$ seconds or not. A RESOLVED cell is then sent to the +user, who only gets to see the clipped TTL regardless of how long it has been +stored in the cache to avoid leaking information about past exit traffic (like +the insertion time which would be trivial to infer from a counted-down TTL). If +too many entries are added to Tor's DNS cache and memory becomes a scarce +resource, an Out-Of-Memory (OOM) job deletes domains until freeing enough +memory. This is all controlled by an event-driven single-threaded main loop. + +Of further note is that TB is based on Firefox. As part of connecting to a +website, DNS is handled transparently through a SOCKS proxy provided by the +local Tor process. Requests to connect to a domain through the SOCKS proxy +result in the user's Tor process sending a BEGIN cell to establish a connection +to the destination, which in turn triggers domain resolution at the exit. 
In +other words, there are two ways to look up domains: RESOLVE cells and BEGIN +cells. At no point is any resolved IP address cached in TB or in the user's Tor +process. This prevents shared state (the cache) from being used to +fingerprint a user's activity across different circuits. + +We continue our introduction to Tor's DNS cache next while describing the +first measurement of its performance. diff --git a/summary/src/tlwo/src/conclusion.tex b/summary/src/tlwo/src/conclusion.tex new file mode 100644 index 0000000..485c593 --- /dev/null +++ b/summary/src/tlwo/src/conclusion.tex @@ -0,0 +1,52 @@ +\section{Conclusion} \label{tlwo:sec:conclusion} +Our timeless timing attack on Tor's DNS cache is virtually perfect, +significantly improving over earlier timing attacks~\cite{wfwo}. Based on 12 +million measurements in the live Tor network, we only observed a 0.00025 failure +rate due to vanished circuits and other transient networking errors that are +easy to account for. We responsibly disclosed the attack to the Tor Project +and coordinated the process around defenses with them. + +Our proposed mitigations are just that---mitigations---and do not completely +address the underlying issues. The fuzzy TTLs mitigation primarily addresses +confirmation with WF attacks involving moderately popular domains. Cover +lookups, while valuable if done, do not scale and require continuous efforts +that are not easily automated on a large scale. + +Setting out to find long-term solutions, we landed in redesigning Tor's DNS +cache completely with a preload design. To inform the design and to evaluate its +feasibility, we ran a four-month experiment starting in May 2022 measuring key +performance metrics. To ensure that our measurements were safe, we repeatedly +consulted the Tor Research Safety Board and completed our university ethical +review process. 
We received positive feedback as well as invaluable suggestions +along the way to minimize any potential harm to the Tor network and its users. + +First, the preload design is immune to timing and timeless attacks due to never +sharing any data in the DNS cache injected due to user activity across circuits. +Secondly, the preload lists of domains based on extended Alexa, extended Tranco, +and Cisco Umbrella all show impressive cache-hit ratios. Depending on list, it +is possible to get comparable cache-hit ratios, memory usage, and resolver load +as Tor today. More extensive lists can trade modest increases in memory and +resolver load with significantly higher cache-hit ratios, especially for web +traffic. Important future work is improving how the extended lists are +generated---e.g., by tailoring them specifically for relays in certain regions +(location sensitivity), excluding unique tracking domains, or crawling websites +to discover subdomains---which is likely to lead to higher cache-hit ratios and +smaller lists. + +One of the biggest downsides of the preload design is that the most effective +preload lists are extended lists based on Alexa or Tranco, requiring continuous +efforts to update. Fortunately, our measurements show that even four-month-old +extended lists remain effective with significant improvement over baseline Tor. +It is likely feasible for the Tor Project to generate and ship hard-coded +preload lists as part of regular Tor releases and still improve performance +compared to today. + +Like Mani \emph{et~al.}~\cite{ManiWJJS18}, we see that traffic in the Tor +network appears to reasonably match website/domain popularity lists like Alexa, +Tranco, and Umbrella. This is fundamental for the preload design, and likely +also a contributing factor for the observed long stability of the extended +preload lists, since the most popular sites see relatively little +churn~\cite{PochatGJ19}. 
Finally, our measurements indicate that the Tor network +has grown by about 300\% in terms of number of streams since 2018, and that the +large majority of Tor's current DNS caching is a privacy harm rather than a +cross-circuit performance boost. diff --git a/summary/src/tlwo/src/introduction.tex b/summary/src/tlwo/src/introduction.tex new file mode 100644 index 0000000..04dd6bc --- /dev/null +++ b/summary/src/tlwo/src/introduction.tex @@ -0,0 +1,151 @@ +\section{Introduction} \label{tlwo:sec:introduction} +Tor~\cite{tor} is a volunteer-operated anonymity network composed of relays that +route encrypted traffic with low latency. +One of Tor's trade-offs is to not provide anonymity against a global passive +attacker that observes traffic as it enters and leaves the +network~\cite{trilemma,tor}. +A typical attacker setting is therefore to only observe encrypted traffic as it +enters the network from an identifiable user, forcing traffic analysis of the +encrypted packets to classify the user's behavior. An attacker that tries to +classify visited websites is said to perform Website Fingerprinting +(WF)~\cite{cheng1998traffic,HerrmannWF09,Hintz02,DBLP:conf/ccs/LiberatoreL06,PanchenkoNZE11,DBLP:conf/sp/SunSWRPQ02}. +Many questions about the practicality of WF attacks have been raised, ranging +from how to keep a trained dataset updated to managing false +positives~\cite{onlinewf,JuarezAADG14,perryCrit,realistic}. False positives in +WF may be ruled out using side-channels~\cite{JuarezAADG14,wfwo}. For example, +an attacker with access to (traffic to~\cite{SibyJDVT20}) Google's public DNS +resolver can use it to confirm if a website visit really happened over +Tor~\cite{GreschbachPRWF17}. + +Side-channels that leak information about exiting traffic are in fact +many~\cite{wfwo}. For example, during the course of a website visit there may +be interactions with DNS resolvers, OCSP responders, real-time bidding +platforms, and CDNs. 
An attacker that is able to query or gain access to the +resulting datasets learns partial information about destination traffic, notably +without ever observing any of the exiting TCP flows typically associated with +correlation attacks on Tor~\cite{JohnsonWJSS13,deepcorr}. Depending on the ease +of accessibility (e.g., does it require Google reach), reliability (e.g., are +there any false positives), and coverage (e.g., is it only applicable for a +small fraction of exit traffic), the impact of a given side-channel will be more +or less urgent to address with mitigations and/or defenses~\cite{tor}. + +\subsection{Timeless Timing Attacks in Tor's DNS} +Timing attacks exploit that an operation takes more or less time to execute +depending on something secret. The attacker's goal is to infer the secret +information by merely observing the non-constant execution times, e.g., to +recover a private key~\cite{timing-attack}, decrypt a ciphertext~\cite{lucky13}, +or check if a domain is cached by a Tor exit~\cite{wfwo}. A remote timing +attack takes place over a network. Repeated measurements and statistics are +usually required to account for network jitter, which adds noise to the observed +timings~\cite{remote-timing-attacks}. Van Goethem~\emph{et~al.}~\cite{timeless} +proposed a technique that eliminates all network jitter in remote attacks. It is applicable if +two requests can be sent to arrive at the same time, request processing is +concurrent, and the order in which responses are returned reflects differences in +execution time. + +We find that Tor's DNS cache at exits fulfills all three criteria of a timeless +timing attack, allowing anyone to determine if a domain is cached or not by +sending a single TLS record. The attack is reliable (neither false positives +nor negatives), confirmed by using our prototype to make 12M network +measurements against our own domains. 
The attack is also repeatable, making the
+exact time that a domain was inserted into the cache inferable due to
+determinism in Tor's TTL logic.
+
+Figure~\ref{tlwo:fig:setting} provides a summary of how the ability to infer whether
+domains are (un)cached at exits makes WF attacks more practical.
+The attacker observes encrypted traffic from a client
+to a guard relay at time $t$, classifying the network trace as associated with
+\texttt{foo.org}. The attacker then conducts timeless timing attacks against
+all exits in the Tor network to determine if \texttt{foo.org} was really visited
+by \emph{someone} at time $t$. If the answer is yes, the classification is
+accepted, otherwise it is rejected. Prior work by Pulls and Dahlberg show that
+the capability to determine whether a website was visited from Tor at time $t$
+removes virtually all false positives in WF attacks for all but the most popular
+websites on the web~\cite{wfwo}. We provide further evidence that this is a
+realistic capability to assume by demonstrating that \emph{any attacker with an
+Internet connection could have used it in attacks for the last two decades}.
+While it is a powerful capability to eliminate false positives, the overall
+success in linking users with their website visits also depends on the WF
+attack~\cite{onlinewf,JuarezAADG14,perryCrit,realistic}.
+
+\begin{figure}[!t]
+ \centering
+ \includegraphics[width=0.67\columnwidth]{src/tlwo/img/setting}
+ \caption{%
+ WF with an attacker that rules out false positives by checking
+ that the expected DNS records were cached at the right time by
+ conducting timeless timing attacks against exits.
+ }
+ \label{tlwo:fig:setting}
+\end{figure}
+
+\subsection{Preload Defenses and Measurements}
+Patching Tor's DNS cache to resist (timeless) timing attacks is challenging
+without hurting performance. For example, making all DNS lookups constant time
+would defeat the purpose of having a cache. 
The idea of our long-term defense
+is to remove harmful cross-circuit caching that is unlikely to boost performance
+while still safely caching useful domains. The Tor-network measurements of Mani
+\emph{et~al.}~\cite{ManiWJJS18} tell us that web-traffic from the Tor network
+matches that of the rest of the Internet, following popularity lists like
+Alexa~\cite{alexa}. What should boost cross-circuit performance is the upper
+parts of a representative popularity list; not the long tail of infrequently
+visited sites. This is the intuition of our defense. Preload a list of popular
+domains that are cached and continuously refreshed by all exits. A domain name
+is either always cached as part of the preload list or not shared across
+circuits at all.
+
+We conduct four months of measurements in the live Tor network to evaluate 400
+popularity lists derived from Alexa~\cite{alexa}, Cisco
+Umbrella~\cite{umbrella}, and Tranco~\cite{tranco}. To put our results into
+perspective, we also measure a baseline of Tor's current DNS cache performance.
+The measurement method is to collect aggregated counters every 15 minutes, e.g.,
+the number of lookups, cache-hits, and memory overhead, from two 100 Mbit/s
+relays with web and permissive exit port policies.
+
+Tor's mean \emph{cross-circuit} cache-hit ratio is currently 11\% (web) and 17\%
+(permissive). Variants of Alexa/Tranco top-200 (web) and Alexa/Tranco top-700
+(permissive) achieve the same cross-circuit cache-hit ratios. A preload list
+from the top-10k can achieve 2--3 times higher cross-circuit cache-hit ratios at
+the cost of at most 60 MiB memory and some increased resolver load (manageable
+in part due to RFC 8767~\cite{rfc8767}). Throughout the entire measurement we
+noted only a slight decline in effectiveness while using stale preload lists
+(i.e., when using four-month-old lists at the end). 
This adds to the feasibility +of using preload lists, as in practice someone has to assemble and deliver them +to all exits in the Tor network. + +\subsection{Contributions and Outline} +Our contributions are as follows: + +\begin{itemize} + \item Performance measurements of the DNS cache in Tor over four months from + two exits, showing an average 80--83\% cache-hit ratio with approximately + 10,000 entries in the cache; around 11--17\% of the observed cache hits are + due to the cache being shared across circuits, and the number of lookups + appears weakly correlated with exit probability + (Section~\ref{tlwo:sec:torcache}). + \item Demonstration of a timeless timing attack that probes for cached + domains in Tor's DNS cache without any false positives or false + negatives after 12M repetitions against our own domain in the Tor + network (Section~\ref{tlwo:sec:attack}). + \item Mitigations based on fuzzy TTLs and cover lookups that add some + limited short-term protections (Section~\ref{tlwo:sec:short}). + \item A long-term redesign of Tor's DNS cache that defends against + (timeless) timing attacks. Cache-hit ratios can be tuned to offer + comparable performance under similar resource usage as today or to + significantly improve shared cache-hit ratios (2--3x) with a modest + increase in memory usage and resolver load, notably invariant to + exit probability as preload lists are fixed (Section~\ref{tlwo:sec:long}). +\end{itemize} + +Section~\ref{tlwo:sec:background} provides necessary background on DNS and Tor, +Section~\ref{tlwo:sec:related} discusses related work, and +Section~\ref{tlwo:sec:conclusion} offers conclusions, followed by the availability of +our research artifacts. + +We would like to highlight that Sections~\ref{tlwo:sec:torcache:ethics}, +\ref{tlwo:sec:attack:ethical}, and \ref{tlwo:sec:long:preload:ethics} describe ethical and +safety precautions to ensure that no users were harmed by our research and to +maximize its positive impact. 
We responsibly disclosed our timeless timing +attack to the Tor Project and engaged with the Tor Research Safety Board as well +as our university's ethical review process as part of performing network +measurements to inform our defenses. diff --git a/summary/src/tlwo/src/long.tex b/summary/src/tlwo/src/long.tex new file mode 100644 index 0000000..16331ac --- /dev/null +++ b/summary/src/tlwo/src/long.tex @@ -0,0 +1,473 @@ +\section{Redesigning Tor's DNS Cache} \label{tlwo:sec:long} +To address (timeless) timing attacks in Tor's DNS cache we considered a +number of possible smaller changes. All of them failed for different reasons, +however. Section~\ref{tlwo:sec:long:strawman} presents a straw-man design that is +helpful to understand \emph{why}, while at the same time being closely related +to the properties achieved by the preload DNS cache design in +Section~\ref{tlwo:sec:long:preload}. Section~\ref{tlwo:sec:long:evaluation} presents an +extensive evaluation that answers questions about how feasible and performant +our proposal is. + +\subsection{Straw-man Design} \label{tlwo:sec:long:strawman} +We omit all but one +straw-man design that is particularly important to understand the proposed +redesign in Section~\ref{tlwo:sec:long:preload}: \emph{simply remove Tor's DNS +cache}. If there is no DNS cache to speak of in Tor, it is easy to see that +there cannot be any (timeless) timing attacks against Tor's DNS cache (because +it does not exist). What these attacks would instead target is the exit's DNS +resolver which also has a cache. At a first glance it may seem like an +insignificant improvement that just moves the problem elsewhere. This would be +the case if every exit used its own dedicated DNS resolver. However, an exit may +share a resolver with other exits or most importantly clients outside of the Tor +network. A prime example is the resolver of the ISP of the exit. 
Any inference +made from the state of shared resolvers would thus not be directly attributable +to activity on the Tor network. This would therefore make \emph{false positives} +a reality with regards to if a domain was cached or not as a consequence of +activity in the Tor network. + +Introducing false positives to the timeless timing attack itself is in general +challenging because an answer needs to be available at the same time regardless +of there being a cache hit or miss. False negatives may seem easier and could +make the attacker discard otherwise correct classifications, e.g., because an +attack only works half of the time. However, without false positives, attackers +are still able to reliably remove otherwise incorrect classification through +confirmation~\cite{wfwo}. Because websites typically make use of multiple domain +names, defenses that add random delays to responses (to cause false negatives) +would need to consistently add similar delays for all relevant domains tied to +websites or other user activity the attacker is trying to infer. The semantics +surrounding user activity is hard if not impossible to capture at the DNS level. +Therefore, all false negative defenses we could think of failed. + +Now suppose that Tor has no DNS cache and exits always use a shared resolver +that may introduce false positives. +A major downside is that performance would take a significant hit due to the +lack of a cache in Tor, especially since a shared resolver is likely not running +locally, but provided by the ISP or some third-party. It is likely that both +page-load latency and resolver load would increase. Worsening performance and +especially latency is the opposite of what the Tor project is working +towards~\cite{tor,tor-congestion}. Next we show how to get the good properties +of not having a DNS cache in Tor (potential for false positives) while improving +performance. 
+ +\subsection{The Preload DNS Cache} \label{tlwo:sec:long:preload} +This is not only a defense against (timeless) timing attacks in the DNS cache, +but a complete redesign of Tor's DNS cache. Ultimately, \emph{what we want to +achieve is false positives for an attacker trying to determine client activity +in the Tor network with the help of DNS}. The only way to achieve this---upon +learning that a domain associated with some activity has been looked up---is if +there is a possibility that this domain lookup was caused from outside of the +Tor network. Therefore, as a starting point, we assume that the Tor Project +would strongly encourage exit operators to not run local resolvers dedicated to +exits. Instead, exit operators should configure their systems to use their ISP +resolvers or use a third-party provider. Greschbach +\emph{et~al.}~\cite{GreschbachPRWF17} investigated the effect of DNS on Tor's +anonymity, including resolver configuration, and found that using the ISP's +resolver would be preferable. + +First remove all of Tor's current DNS caching as in our straw-man design. The +preloaded DNS cache instead contains two types of caches: a same-circuit cache +and a shared preload cache, see Figure~\ref{tlwo:fig:preload}. The preloaded cache +only contains domains from an allowlist. This allowlist is compiled by a +central party (e.g., by the Network Health team in the Tor Project) by visiting +popular sites from several different vantage points. The allowed domains are +then delivered to exits and continuously resolved to IPs by each exit. During +domain resolution on a circuit, the client's lookup first hits the preload +cache. If the domain is preloaded, a cache hit is guaranteed regardless of if +anyone performed a lookup before. Therefore, it is safe to share this cache +across circuits without leaking information about past exit traffic. On a cache +miss, the circuit's same-circuit cache is consulted. 
As the name suggests, this
+cache is shared for streams on the same circuit but not across different
+circuits. Due to Tor's circuit isolation, an attacker is unable to probe any
+other cache than their own. Therefore, (timeless) timing attacks are eliminated
+(similar to if Tor did not have a DNS cache at all), but without removing the
+possibility of cache hits.
+
+\begin{figure}[!t]
+ \centering
+ \includegraphics[width=.85\columnwidth]{src/tlwo/img/preload}
+ \caption{%
+ Overview of the preloaded DNS cache design. A central party
+ visits sites on a popularity list from different vantage points
+ to compile an allowlist of domains that each relay keeps
+ preloaded at all times by resolving them continuously. DNS
+ look-ups start in the shared preload cache and move on to a
+ dynamic cache that is never shared across circuits on cache
+ misses.
+ }
+ \label{tlwo:fig:preload}
+\end{figure}
+
+Including a same-circuit cache in the defense is motivated by Tor's significant
+same-circuit caching to retain performance, see
+Figures~\ref{tlwo:fig:metrics:hitsweb} and~\ref{tlwo:fig:metrics:hitsperm} in
+Section~\ref{tlwo:sec:torcache:metrics}. One can confirm that this is most likely due
+to Tor Browser opening several concurrent connections by referring to the
+\texttt{network.http.max-persistent-con
+nections-per-proxy} option and/or
+enabling debug logging,\footnote{%
+ Enable debug logging in Tor Browser: \url{https://gitlab.torproject.org/tpo/applications/tor-browser/-/wikis/Hacking\#debugging-the-tor-browser}
+} observing that multiple streams are often created to the same destination.
+Note that these destinations are domains and not IPs, and that neither TB nor
+the client-side Tor process has any notion of a DNS cache to prevent
+cross-circuit fingerprinting (see Section~\ref{tlwo:sec:background:tor}). 
While a
+hypothetical \emph{per-circuit client-side cache} would be an option, it would
+per definition not be able to generate cache hits for concurrent resolves
+(without violating circuit isolation, potentially leading to cross-circuit
+fingerprinting) and put pressure on exits unless they do the appropriate
+caching. This is why our design places the same-circuit cache at exits instead
+of clients.
+
+A preload cache is also motivated by performance, however without any of the
+harmful cross-circuit sharing. The remainder of this section explores the
+performance impact of compiling an allowlist from popularity lists---like
+Alexa~\cite{alexa}, Cisco Umbrella~\cite{umbrella}, and
+Tranco~\cite{tranco}---by comparing the resulting cache-hit ratios to baseline
+Tor today. The preloaded DNS cache is inspired by RFC 8767~\cite{rfc8767} which
+allows resolvers to serve stale data in some cases (see
+Section~\ref{tlwo:sec:related}). Here, exits keep domains on a preloaded allowlist
+fresh on a best-effort level, serving stale records if necessary. Upon shutdown,
+exits could persist IPs in the preload cache to disk as a starting point on
+startup. Upon startup, if the preload cache has yet to be populated with IPs,
+simply treat lookups as cache misses. We discuss periodic refresh overhead
+further in Section~\ref{tlwo:sec:long:preload:resolverload}.
+
+\subsection{Data Collection} \label{tlwo:sec:long:preload:collection}
+
+As part of understanding Tor's DNS cache (Section~\ref{tlwo:sec:torcache}) we also
+collected data to be able to evaluate the performance of the preload design. In
+particular, we evaluate different popularity lists, the impact on cache-hit
+ratio, estimated DNS cache size, and how these change over time.
+
+Central to the preload design are domain popularity lists. 
We included the Alexa +list~\cite{alexa} because that is what Mani \emph{et~al.} showed to be accurate for +Tor~\cite{ManiWJJS18}, the newer Tranco list because it may be more +accurate~\cite{tranco}, and the Cisco Umbrella list because it also contains +``non-web'' domains~\cite{umbrella}. + +In addition to considering the popularity lists, we also created \emph{extended} +lists from Alexa and Tranco by visiting each domain on those lists using the +Chromium browser and recording all requests for additional domains. We repeated +this three times from Europe, twice from the US, and twice from Hong Kong by +using a popular VPN service. Each visit was given a timeout of 20 seconds. No +pruning of the resulting extended lists of domains was done. Much can likely be +done to make these lists of domains significantly more comprehensive (e.g., by +considering popular subpages that might contain domains not on the front-page of +websites) and smaller (e.g., by pruning unique tracking domains: in one of our +biggest lists, \texttt{*.safeframe.googlesyndication.com} makes up 8\% of +domains with unique tracking subdomains with no value for caching). Another +direction to explore that could result in lists that are smaller and/or more +comprehensive would be to tailor them specifically for relays in certain +regions. For example, website visits from Europe may be treated differently by +website operators due to regulations like the GDPR. (In other words, there could +be differences with regards to \emph{domains}---not to be confused with IPs that +each relay already resolves locally---that are encountered during website +visits.) + +Based on the regular and extended popularity lists, we made several lists from +top-10 up to and including top-10,000 in increments. 
Further, the weekend before
+each of the \emph{first four weeks} of data collection (see
+Section~\ref{tlwo:sec:torcache}), we downloaded fresh popularity lists (Fridays) and
+generated new extended lists (Saturdays and Sundays). We generated in total
+$4*20*5 = 400$ lists: for the first four weeks, 20 lists each for \{Alexa,
+Tranco, Umbrella, extended Alexa, extended Tranco\}.
+
+Our data collection involving the lists took place in three phases. The first
+phase consisted of the first four weeks with increasingly more lists, which was
+followed by two weeks of analysis of our results and dialog with the Tor
+Research Safety Board. This led us to the third and final phase of data
+collection where we excluded the vast majority of lists, focusing only on
+getting extended data for about eleven more weeks on the most informative and
+useful lists (see Section~\ref{tlwo:sec:long:evaluation}).
+
+\subsection{Further Ethical Considerations} \label{tlwo:sec:long:preload:ethics}
+
+We discussed the preload additions as part of our other data collection,
+received feedback from the Tor Research Safety Board, and passed our
+university's ethical review process.
+
+Our rationale for why including counters for preload lists is safe was as
+follows. We collect counters of aggregate lookups that would have been
+cache-hits on each list over 15 minutes. Except for the top-10 lists
+(non-extended), all other lists contain in the order of 100--1,000 unique
+domains aggregated into a single counter. The main harm associated with the
+dataset is if they enable an attacker to \emph{rule out} that a particular
+website or Tor-user activity took place at our exits (see following paragraph).
+So, little to no zero counters in our data is what we set out to achieve. As an
+additional safety precaution our exits only have a $0.1$\% exit probability,
+further making any zero counter less useful.
+
+Let us look more closely at the potential harm. 
For websites, the results of Mani +\emph{et~al.}~\cite{ManiWJJS18} tell an attacker to expect a power-law-like +distribution of website popularity in the network. As discussed in +Section~\ref{tlwo:sec:torcache:ethics}, we expect on average about 725 website visits +to each exit per 15 minute period. This is \emph{the prior of an attacker} +wishing to perform confirmation or correlation attacks. Most of the visits +should be to popular websites (per definition) and if the dataset allows an +attacker to rule such visits out it may cause harm because it is useful +information to the attacker~\cite{wfwo}. Because of this, we grouped our lists +into intervals of 100s (for top-?00) and 1000s (for top-?000). We stopped at +top-10k because we estimated little utility of caching domains of even less +popular websites. Further, to illustrate when the type of data we collect can be +harmful, the results of Mani \emph{et~al.}~\cite{ManiWJJS18} and Pulls and +Dahlberg~\cite{wfwo} tell us that at some point the logic becomes flipped in +terms of attacker utility: confirming that it was possible that a visit took +place to a \emph{rarely visited website} is useful. The popularity (i.e., +network base rate) of websites is central. We set out to only collect data on +the most popular of websites/domains, so for us, the focus is on when the +attacker can rule out website visits or some user activity: an attacker already +expects that popular websites/domains are visited. + +We analyzed the 1,330,400 sample counters we collected over the first four weeks +for different popularity lists. We found 33 zero counters. All of them belonged +to Alexa top-10 lists from different weeks! Besides Alexa top-10, the next list +with the lowest counter was Tranco top-100 from 20 May with 39 hits. 
Finding
+empty counters for Alexa top-10 was at first very surprising, because the list
+contains the most popular websites on the web (e.g., from 20 May:
+\texttt{google.com}, \texttt{youtube.com}, \texttt{baidu.com},
+\texttt{facebook.com}, \texttt{instagram.com}, \texttt{bilibili.com},
+\texttt{qq.com}, \texttt{wikipedia.org}, \texttt{amazon.com}, and
+\texttt{yahoo.com}). However, note how the actual \emph{domains} on the list (of
+\emph{websites}) do not contain the \texttt{www} prefix nor any other popular subdomain
+associated with the sites. This highlights how poor the regular non-extended
+lists are at capturing actual \emph{website} traffic. We can further see this
+for both Alexa and Tranco in Figure~\ref{tlwo:fig:preload:heatmap}, presented next in
+Section~\ref{tlwo:sec:long:preload:lists}. Even the top-10k lists have low
+cache-hit ratios.
+
+By comparing a list with a more popular list (which should be a strict subset)
+and observing the same counter value it is also possible to infer that
+\emph{likely} no visits took place to the unique domains on the less popular
+list. (This could happen by chance though.) We found 16,055 (1.2\%) such
+samples: 5,073 to top-10k lists, 3,703 to top-[1k,10k) lists, and 7,279 to
+top-[200,1k) lists. None of them were to top-100 lists. This might seem alarming
+at first glance, but taking a closer look at the lists we find that only 135 of
+the samples were to extended lists (77 to x-Tranco top-10k, the highest rank
+list was x-Tranco top-600 with one sample). Further, only five of the samples
+belonged to a list from Umbrella. The remaining 15,915 samples were to the
+regular (non-extended) Alexa and Tranco lists. This is of limited use to
+attackers for popular domains, because while the lists capture popular
+\emph{websites}, our dataset contains counters of \emph{aggregate domain
+lookups}. 
An inferred zero counter \emph{does not mean that no visits took +place} to \emph{websites} for the \emph{non-extended} lists. For example, if you +enter \texttt{www.google.com} or \texttt{www.wikipedia.org} into Tor Browser, +neither \texttt{google.com} nor \texttt{wikipedia.org} are actually connected +to. The recursive resolver of the exit may perform the lookup, but Tor will not, +so it is not captured in our dataset for the non-extended lists. The extended +lists, due to being generated from actual website visits, include domains +typically connected to by Tor Browser. Another example is users visiting direct +links to websites and not entering the domain manually in the browser, such as +when following links from search engines or sent through social media. + +When designing our measurements the above detail was not considered. We included +the regular popularity lists for sake of comparison. Ideally the non-extended +lists would have been effective sources for preload lists. This was evidently +not the case for Alexa and Tranco (see later results), but was the case for +Umbrella. So while what we did learn helped us understand the value of using +extended lists to improve cache hits, in hindsight we could have come to the +same conclusion without the same granularity for non-extended lists. + +In the second phase of our data collection (see +Section~\ref{tlwo:sec:long:preload:collection}), we discussed the above detail with +the Tor Research Safety Board and concluded to stop collecting data for +(non-extended) Alexa and Tranco, and to minimize the lists for future collection +to those necessary to determine the longevity of potentially useful preload +lists (based on our findings). Out of an abundance of caution, we will only +share the collected counters for non-extended Alexa and Tranco lists with +researchers for research purposes (the rest of the data is public). 
The counters +collected during the second phase were consistent with the data from the first +phase. + +During the third phase of data collection, we limited the collection to extended +Tranco top-\{10, 100, 1k, 2k, 4k, 5k, 7k, 10k\} lists and the Umbrella top-10k +list, all from April 29. The goal was to learn how cache hits get worse over +time with old lists. Out of 141,624 sample counters collected, three were zero +and 59 were relatively zero when compared to the more popular list. + +\subsection{Performance Evaluation} \label{tlwo:sec:long:evaluation} + +The goal of our evaluation is to determine over time: + cache-hit ratio of potential preload lists (Section~\ref{tlwo:sec:long:preload:lists}), + memory usage at exits (Section~\ref{tlwo:sec:long:preload:entries}), and + resolver load (Section~\ref{tlwo:sec:long:preload:resolverload}). + +\subsubsection{Results: Preload Lists} \label{tlwo:sec:long:preload:lists} + +\begin{figure} + \centering + \subfloat[][web]{% + \includegraphics[width=.75\columnwidth]{src/tlwo/img/plot_preload_lists-web.pdf} + \label{tlwo:fig:preload:heatmap:web} + }\\ + \subfloat[][permissive]{% + \includegraphics[width=.75\columnwidth]{src/tlwo/img/plot_preload_lists-permissive.pdf} + \label{tlwo:fig:preload:heatmap:perm} + } + \caption{% + Shared cross-circuit cache-hit ratios (\%) for selected preload lists during + the first six weeks (x-axis) of data collection. The plotted values are medians over + 24h, and dates on the y-axis show the date of original list download.} + \label{tlwo:fig:preload:heatmap} +\end{figure} + +Our dataset is extensive with 2,498,424 sample counters from 400 popularity +lists spanning about four months. Figure~\ref{tlwo:fig:preload:heatmap} shows +comprehensive heatmaps of shared cross-circuit cache-hit ratios for the web +(Figure~\ref{tlwo:fig:preload:heatmap:web}) and permissive +(Figure~\ref{tlwo:fig:preload:heatmap:perm}) exits over the first six weeks of data +collection (first and second phases). 
Cache-hit ratios are medians (very similar +to the mean) for 24h periods. In each figure, the groupings of the four weeks +when we added new lists are visible (top to bottom), as well as baseline Tor at +the bottom row for sake of comparison. Note how the regular Alexa and Tranco +top-10k lists perform poorly: the two black ($<5\%$ cache-hit ratio) lines at +the top of each grouping. Even Umbrella 1k is better, with Umbrella 10k being +comparable to baseline Tor. The extended lists clearly improve over baseline +Tor, with the extended 10k-lists even reaching over 30\% cross-circuit cache-hit +ratios some days. Look at how the lists change over time: we see no real +difference between lists generated at end of April and those generated during +May, but consistent changes across all lists over time, likely due to varying +traffic at the exits. The differences between using Alexa or Tranco to generate +extended lists are negligible, so we focus on Tranco for the remainder of this +analysis as it is open, maintained, and a more recent source of website +popularity~\cite{tranco}. + + +\begin{figure}[!t] + \centering + \subfloat[][web]{% + \includegraphics[width=.67\columnwidth]{src/tlwo/img/plot_preload_hits-web.pdf} + \label{tlwo:fig:preload:hits:web} + }\\ + \subfloat[][permissive]{% + \includegraphics[width=.67\columnwidth]{src/tlwo/img/plot_preload_hits-permissive.pdf} + \label{tlwo:fig:preload:hits:perm} + } + \caption{% + Shared cross-circuit cache-hit ratios for eight different extended Tranco + lists, Umbrella top-10k, and Tor baseline during four months in 2022.} + \label{tlwo:fig:preload:hits} +\end{figure} + +Figure~\ref{tlwo:fig:preload:hits} shows the observed \emph{cross-circuit} cache-hit +ratios for eight different extended Tranco lists, Umbrella top-10k, and Tor +baseline. We used lists from the end of April because they have the most data. +As a baseline, Tor's current DNS cache has a mean cache-hit ratio of 11\% for +web and 17\% for permissive. 
In terms of different popularity lists, the regular
+(non-extended) Tranco and Alexa lists are ineffective: the top-10k lists are
+regularly below $5\%$ for web and permissive (see
+Figure~\ref{tlwo:fig:preload:heatmap}). Umbrella top-10k does much better with mean
+17\% (web) and 16\% (permissive). This is slightly worse (permissive) and
+comparable (web) to baseline Tor.
+
+The extended lists show a further improvement, comparable in terms of
+\emph{average} (full duration of lists) cross-circuit cache-hit ratios to
+baseline Tor at top-200 for Alexa and Tranco for web and at top-700 for
+permissive. The extended lists from top-1k get (depending on which of the
+compiled extended Tranco lists) 20--24\% (web) and 15--18\% (permissive) and up
+to 27--32\% (web) and 22--27\% (permissive) at 10k. There is very little gain
+between top-7k and top-10k. In general, the extended lists do relatively worse
+on the permissive exit and the Tor baseline is higher: this makes sense, since
+Alexa and Tranco are focused on websites. This is further confirmed by Umbrella
+doing better as a more general-purpose domain popularity list.
+
+Note that Figure~\ref{tlwo:fig:preload:hits} shows the cross-circuit cache-hit ratios
+for a selection of the very first preload lists we created on April 29. The
+results are very encouraging: time seems to have only a slight detrimental
+impact on cache hits. After four months the larger extended lists show a
+noticeable performance improvement over baseline, with the exception of an odd
+spike in baseline in early September (we speculate that this is DDoS-related).
+The robustness of preload lists removes one of the main downsides of the preload
+design, i.e., to maintain and deliver a current list to exits. It is likely
+feasible to ship hard-coded preload lists as part of regular Tor releases and
+still improve performance, assuming that exit operators upgrade their software a
+couple of times per year. 
+
+\subsubsection{Results: Cache Entries} \label{tlwo:sec:long:preload:entries}
+
+\begin{figure}
+  \centering
+  \subfloat[][web]{%
+    \includegraphics[width=.67\columnwidth]{src/tlwo/img/plot_preload_entries-web.pdf}
+    \label{tlwo:fig:preload:entries:web}
+  }\\
+  \subfloat[][permissive]{%
+    \includegraphics[width=.67\columnwidth]{src/tlwo/img/plot_preload_entries-permissive.pdf}
+    \label{tlwo:fig:preload:entries:perm}
+  }
+  \caption{%
+  Estimated cache entries for eight different extended Tranco lists, Umbrella
+  top-10k, and Tor baseline.}
+  \label{tlwo:fig:preload:entries}
+\end{figure}
+
+Figure~\ref{tlwo:fig:preload:entries} shows the number of cache entries needed in Tor
+as-is (``baseline Tor'') and for the preload design for a range of different
+popularity lists. We can accurately estimate an upper bound because we collected
+the total number of entries in all same-circuit caches as part of our
+measurements. This count is an upper bound, because some of those entries would
+have already been cached in the preload cache. The popularity lists have static
+sizes, and to be an accurate upper bound we used the largest observed size for
+each list over the four weeks.
+
+Starting with the same-circuit cache, look at the line for extended Tranco
+top-10 (``x-Tranco 10'') in Figure~\ref{tlwo:fig:preload:entries}: this extended list
+contains only 90 entries, so the lines at the bottom show mostly the number of
+entries used by the same-circuit cache. The size of the same-circuit caches
+should be proportional to the number of open circuits, and therefore follow exit
+probability. Based on the data from Figure~\ref{tlwo:fig:preload:entries}, we do not
+suspect this to be a significant burden. It would be trivial to cap the size
+and/or prune the size as part of OOM-management, or dropping entries based on
+their age would probably have little impact on performance (presumably most
+value is at the start of the circuit when most streams are attached).
+
+Recall from Section~\ref{tlwo:sec:torcache:metrics} and
+Figures~\ref{tlwo:fig:metrics:cacheweb} and~\ref{tlwo:fig:metrics:cacheperm} that the
+permissive exit had a mean of 12,130 entries compared to the web exit's 7,672
+mean. We see the same figures for the baseline in
+Figure~\ref{tlwo:fig:preload:entries}. Albeit slightly higher on average for the web
+exit but more stable, we see that Umbrella 10k as well as extended Tranco top-1k
+are about the same as Tor baseline. So with about the same memory usage as now
+the preload design would offer slightly (permissive) or noticeably (web) better
+cache-hit ratios. Looking at the top-2k up until top-10k extended lists we see a
+significantly higher memory usage (only slightly sublinear) but that comes with
+significantly higher cache-hit ratios, as seen in Figure~\ref{tlwo:fig:preload:hits}.
+In absolute terms, for extended Tranco top-10k, about 60,000 cache
+entries---even if pessimistically assuming 1 KiB per entry---would end up using
+about 60 MiB of memory for the cache. Since domains can be at most 255 bytes and
+most domains are much shorter, one could clearly implement the cache
+more memory-efficiently. Also, as mentioned earlier, it is likely possible to
+reduce the size of the extended top-10k lists by removing useless tracking
+domains. Further note that the memory needed to cache the preload list---unlike
+the same-circuit cache---only depends on the size of the list, not the number of
+circuits or streams created at the exit.
+
+\subsubsection{Results: Resolver Load} \label{tlwo:sec:long:preload:resolverload}
+
+In general, on the one hand, improving cache-hit ratios will reduce resolver load
+and scale well with increased traffic. On the other hand, continuously
+refreshing domains on the preload list increases resolver load. Consider the
+mean number of lookups at the web exit, 17,529, and its mean/median cache-hit
+ratio of 0.80 (see Section~\ref{tlwo:sec:torcache}).
This implies an expected
+$3.9\gets\frac{17529(1-0.80)}{15\cdot60}$ requests per second to the exit's
+resolver. For the permissive exit we observed about 7.8 requests per second. As
+a source of comparison, Sonntag~\cite{sonntag-metrics,Sonntag19} reports for a
+DNS resolver dedicated to a 200 Mbit/s exit in 2018 an average of 18.5 requests
+per second.
+
+The resolver load for the different preload lists should be proportional to the
+estimated number of cache entries shown in Figure~\ref{tlwo:fig:preload:entries}. The
+estimated load for an extended top-1k list would be similar to current Tor,
+while the extended top-10k list would see about a seven-fold increase without
+changes. This may or may not be a problem. Given the variability of lookups we
+observed throughout our data collection (Figure~\ref{tlwo:fig:lookups}) and reported
+by Sonntag, resolvers are clearly capable of dealing with increased loads.
+Requests due to the preload list should be predictable, consistent, and cheap in
+terms of bandwidth even for a low-capacity exit.
+
+Regardless, the load on resolvers could be lowered by reducing the number of
+domains, e.g., the increased cache-hit ratio from top-7k to top-10k is very
+small ($\approx$1\%) for a 20--30\% increase in entries. One could also increase
+the internal TTLs, i.e., the frequency of refreshing the entries in the preload
+cache. In Tor, this is especially practical since circuits use random exits. In
+the rare case of stale data causing issues, simply create a fresh circuit.
+Serving stale data is not uncommon in DNS~\cite{rfc8767}, further discussed next
+in Section~\ref{tlwo:sec:related}.
diff --git a/summary/src/tlwo/src/ref.bib b/summary/src/tlwo/src/ref.bib new file mode 100644 index 0000000..c62bea8 --- /dev/null +++ b/summary/src/tlwo/src/ref.bib @@ -0,0 +1,352 @@ +@misc{ctor-1, + author = {Tor Project}, + title = {{Tor source code}}, + howpublished = {\url{https://gitlab.torproject.org/tpo/core/tor/-/blob/tor-0.4.7.7/src/feature/relay/dns.c\#L600-689}, accessed 2022-06-01}, +} + +@misc{ctor-2, + author = {Tor Project}, + title = {{Tor source code}}, + howpublished = {\url{https://gitlab.torproject.org/tpo/core/tor/-/blob/tor-0.4.7.7/src/lib/tls/buffers\_tls.c\#L43-100}, accessed 2022-06-01}, +} + +@misc{ctor-3, + author = {Tor Project}, + title = {{Tor source code}}, + howpublished = {\url{https://gitlab.torproject.org/tpo/core/tor/-/blob/tor-0.4.7.7/src/core/or/connection\_or.c\#L2361-2426}, accessed 2022-06-01}, +} + +@misc{ctor-4, + author = {Tor Project}, + title = {{Tor source code}}, + howpublished = {\url{https://gitlab.torproject.org/tpo/core/tor/-/blob/tor-0.4.7.7/src/feature/relay/dns.c\#L725-732}, accessed 2022-06-01}, +} + +@misc{network-ddos, + author = {Tor Project}, + title = {{Tor Project status}}, + howpublished = {\url{https://web.archive.org/web/20220906145324/https://status.torproject.org/}}, +} + +@article{jung, + author = {Jaeyeon Jung and Emil Sit and Hari Balakrishnan and Robert Tappan Morris}, + title = {{DNS} performance and the effectiveness of caching}, + journal = {{IEEE/ACM} Trans. 
Netw.}, + volume = {10}, + number = {5}, + year = {2002}, +} + +@article{hao-and-wang, + author = {Shuai Hao and Haining Wang}, + title = {Exploring Domain Name Based Features on the Effectiveness of {DNS} Caching}, + journal = {CCR}, + volume = {47}, + number = {1}, + year = {2017}, +} + + + +@inproceedings{trilemma, + author = {Debajyoti Das and Sebastian Meiser and Esfandiar Mohammadi and Aniket Kate}, + title = {Anonymity Trilemma: Strong Anonymity, Low Bandwidth Overhead, Low Latency---Choose Two}, + booktitle = {{IEEE} {S\&P}}, + year = {2018}, +} + +@article{remote-timing-attacks, + author = {Scott A. Crosby and Dan S. Wallach and Rudolf H. Riedi}, + title = {Opportunities and Limits of Remote Timing Attacks}, + journal = {{ACM} Trans. Inf. Syst. Secur.}, + volume = {12}, + number = {3}, + year = {2009}, +} + + +@inproceedings{lucky13, + author = {Nadhem J. AlFardan and Kenneth G. Paterson}, + title = {Lucky Thirteen: Breaking the {TLS} and {DTLS} Record Protocols}, + booktitle = {{IEEE} {S\&P}}, + year = {2013}, +} + +@inproceedings{timing-attack, + author = {Paul C. Kocher}, + title = {Timing Attacks on Implementations of {Diffie-Hellman}, {RSA}, {DSS}, and Other Systems}, + booktitle = {{CRYPTO}}, + year = {1996}, +} + +@inproceedings{tor, + author = {Roger Dingledine and Nick Mathewson and Paul F. 
Syverson}, + title = {Tor: The Second-Generation Onion Router}, + booktitle = {USENIX Security}, + year = {2004}, +} + +@article{wfwo, + author = {Tobias Pulls and Rasmus Dahlberg}, + title = {Website Fingerprinting with Website Oracles}, + journal = {PETS}, + volume = {2020}, + number = {1}, +} + +@inproceedings{timeless, + author = {Tom van Goethem and Christina P{\"{o}}pper and Wouter Joosen and Mathy Vanhoef}, + title = {Timeless Timing Attacks: Exploiting Concurrency to Leak Secrets over Remote Connections}, + booktitle = {{USENIX} Security}, + year = {2020}, +} + +@inproceedings{sonntag-metrics, + author = {Michael Sonntag}, + title = {{DNS} Traffic of a {Tor} Exit Node---An Analysis}, + booktitle = {{SpaCCS}}, + year = {2018}, +} + +@inproceedings{ManiWJJS18, + author = {Akshaya Mani and T. Wilson{-}Brown and Rob Jansen and Aaron Johnson and Micah Sherr}, + title = {Understanding {Tor} Usage with Privacy-Preserving Measurement}, + booktitle = {{IMC}}, + year = {2018}, +} + +@inproceedings{tranco, + author = {{Le Pochat}, Victor and {Van Goethem}, Tom and Tajalizadehkhoob, Samaneh and Korczy\'{n}ski, Maciej and Joosen, Wouter}, + title = {Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation}, + booktitle = {NDSS}, + year = {2019}, +} + +@misc{umbrella, + author = {Cisco}, + title = {Umbrella Popularity List}, + howpublished = {\url{https://umbrella-static.s3-us-west-1.amazonaws.com/index.html}, accessed 2022-04-29}, +} + +@misc{alexa, + author = {Amazon}, + title = {Alexa Top 1 Million}, + howpublished = {\url{https://www.alexa.com/}, accessed 2022-04-29}, +} + +@inproceedings{GreschbachPRWF17, + author = {Benjamin Greschbach and Tobias Pulls and Laura M. 
Roberts and Philipp Winter and Nick Feamster}, + title = {The Effect of {DNS} on {Tor}'s Anonymity}, + booktitle = {NDSS}, + year = {2017}, +} + +@techreport{rfc8767, + author = {David C Lawrence and Warren Kumari and Puneet Sood}, + title = {Serving Stale Data to Improve {DNS} Resiliency}, + type = {RFC}, + institution = {IETF}, + number = {8767}, + year = {2020}, +} + +@inproceedings{Sonntag19, + title = {Malicious {DNS} Traffic in {Tor}: Analysis and Countermeasures}, + author = {Sonntag, Michael}, + booktitle = {ICISSP}, + year = {2019} +} + +@inproceedings{privcount, + author = {Rob Jansen and Aaron Johnson}, + title = {Safely Measuring {Tor}}, + booktitle = {{CCS}}, + year = {2016}, +} + +@inproceedings{privex, + author = {Tariq Elahi and George Danezis and Ian Goldberg}, + title = {{PrivEx}: Private Collection of Traffic Statistics for Anonymous Communication Networks}, + booktitle = {{CCS}}, + year = {2014}, +} + + +@inproceedings{FenskeMJS17, + author = {Ellis Fenske and Akshaya Mani and Aaron Johnson and Micah Sherr}, + title = {Distributed Measurement with Private Set-Union Cardinality}, + booktitle = {{CCS}}, + year = {2017}, +} + +@inproceedings{JansenTH18, + author = {Rob Jansen and Matthew Traudt and Nicholas Hopper}, + title = {Privacy-Preserving Dynamic Learning of {Tor} Network Traffic}, + booktitle = {{CCS}}, + year = {2018}, +} + +@inproceedings{MouraHMSD18, + author = {Giovane C. M. Moura and John S. 
Heidemann and Moritz M{\"{u}}ller and Ricardo de Oliveira Schmidt and Marco Davids}, + title = {When the Dike Breaks: Dissecting {DNS} Defenses During {DDoS}}, + booktitle = {{IMC}}, + year = {2018}, +} + +@misc{unbound, + author = {{NLnet Labs}}, + title = {Serving Stale Data---Unbound 1.14.0 documentation}, + howpublished = {\url{https://unbound.docs.nlnetlabs.nl/en/latest/topics/serve-stale.html}, accessed 2022-06-01}, +} + +@article{cheng1998traffic, + title = {Traffic analysis of {SSL} encrypted web browsing}, + author = {Cheng, Heyning and Avnur, Ron}, + journal = {Project paper, University of Berkeley}, + year = {1998} +} + +@inproceedings{DBLP:conf/sp/SunSWRPQ02, + author = {Qixiang Sun and Daniel R. Simon and Yi{-}Min Wang and Wilf Russell and Venkata N. Padmanabhan and Lili Qiu}, + title = {Statistical Identification of Encrypted Web Browsing Traffic}, + booktitle = {{IEEE S\&P}}, + year = {2002} +} + +@inproceedings{Hintz02, + author = {Andrew Hintz}, + title = {Fingerprinting Websites Using Traffic Analysis}, + booktitle = {{PETS}}, + year = {2002} +} + +@inproceedings{DBLP:conf/ccs/LiberatoreL06, + author = {Marc Liberatore and Brian Neil Levine}, + title = {Inferring the source of encrypted {HTTP} connections}, + booktitle = {{CCS}}, + year = {2006} +} + +@inproceedings{HerrmannWF09, + author = {Dominik Herrmann and Rolf Wendolsky and Hannes Federrath}, + title = {Website fingerprinting: attacking popular privacy enhancing technologies with the multinomial na{\"{\i}}ve-bayes classifier}, + booktitle = {{CCSW}}, + year = {2009} +} + +@inproceedings{PanchenkoNZE11, + author = {Andriy Panchenko and Lukas Niessen and Andreas Zinnen and Thomas Engel}, + title = {Website fingerprinting in onion routing based anonymization networks}, + booktitle = {{WPES}}, + year = {2011} +} + +@inproceedings{PochatGJ19, + author = {Victor Le Pochat and Tom van Goethem and Wouter Joosen}, + title = {Evaluating the Long-term Effects of Parameters on the Characteristics of 
the {Tranco} Top Sites Ranking}, + booktitle={{USENIX Security}}, + year={2019} +} + +@inproceedings{JuarezAADG14, + author = {Marc Ju{\'{a}}rez and Sadia Afroz and Gunes Acar and Claudia D{\'{\i}}az and Rachel Greenstadt}, + title = {A Critical Evaluation of Website Fingerprinting Attacks}, + booktitle = {{CCS}}, + year = {2014}, +} + +@misc{perryCrit, + author = {Mike Perry}, + title = {A Critique of Website Traffic Fingerprinting Attacks}, + howpublished = {\url{https://blog.torproject.org/critique-website-traffic-fingerprinting-attacks}, accessed 2022-06-01}, +} + +@article{realistic, + author = {Tao Wang and Ian Goldberg}, + title = {On Realistically Attacking {Tor} with Website Fingerprinting}, + journal = {PETS}, + volume = {2016}, + number = {4}, +} + +@inproceedings{JohnsonWJSS13, + author = {Aaron Johnson and Chris Wacek and Rob Jansen and Micah Sherr and Paul F. Syverson}, + title = {Users get routed: traffic correlation on {Tor} by realistic adversaries}, + booktitle = {{CCS}}, + year = {2013} +} + +@inproceedings{deepcorr, + author = {Milad Nasr and Alireza Bahramali and Amir Houmansadr}, + title = {DeepCorr: Strong Flow Correlation Attacks on {Tor} Using Deep Learning}, + booktitle = {{CCS}}, + year = {2018} +} + +@inproceedings{AumannL07, + author = {Yonatan Aumann and Yehuda Lindell}, + title = {Security Against Covert Adversaries: Efficient Protocols for Realistic Adversaries}, + booktitle = {{TCC}}, + year = {2007}, +} + +@inproceedings{chen, + author = {Yizheng Chen and Manos Antonakakis and Roberto Perdisci and Yacin Nadji and David Dagon and Wenke Lee}, + title = {{DNS} Noise: Measuring the Pervasiveness of Disposable Domains in Modern {DNS} Traffic}, + booktitle = {{IEEE/IFIP} {DSN}}, + year = {2014}, +} + + +@inproceedings{exitmap, + author = {Philipp Winter and Richard K{\"{o}}wer and Martin Mulazzani and Markus Huber and Sebastian Schrittwieser and Stefan Lindskog and Edgar R. 
Weippl}, + title = {Spoiled Onions: Exposing Malicious {Tor} Exit Relays}, + booktitle = {{PETS}}, + year = {2014}, +} + +@inproceedings{SibyJDVT20, + author = {Sandra Siby and Marc Ju{\'{a}}rez and Claudia D{\'{\i}}az and Narseo Vallina{-}Rodriguez and Carmela Troncoso}, + title = {Encrypted {DNS} -{\textgreater} Privacy? {A} Traffic Analysis Perspective}, + booktitle = {NDSS}, + year = {2020}, +} + +@misc{tpo-russia, + author = {Gustavo Gus}, + title = {Responding to {Tor} censorship in {Russia}}, + year = {2021}, + howpublished = {\url{https://blog.torproject.org/tor-censorship-in-russia/}, accessed 2022-06-01}, +} + +@misc{tpo-who-uses-tor, + author = {Al Smith}, + title = {Tor and the humans who use it}, + howpublished = {\url{https://blog.torproject.org/tor-and-the-humans-who-use-it/}, accessed 2022-06-01}, +} + +@misc{tb, + author = {Mike Perry and Erinn Clark and Steven Murdoch and Georg Koppen}, + title = {The Design and Implementation of the {Tor Browser} {[DRAFT]}}, + howpublished = {\url{https://2019.www.torproject.org/projects/torbrowser/design/}, accessed 2022-06-01}, +} + +@misc{tor-congestion, + author = {Mike Perry}, + title = {Congestion Control Arrives in {Tor} 0.4.7-stable!}, + howpublished = {\url{https://blog.torproject.org/congestion-contrl-047/}, accessed 2022-06-01}, +} + +@inproceedings{histor, + author = {Akshaya Mani and Micah Sherr}, + title = {HisTor{\(\epsilon\)}: Differentially Private and Robust Statistics Collection for {Tor}}, + booktitle = {NDSS}, + year = {2017}, +} + +@inproceedings{onlinewf, + title={Online Website Fingerprinting: Evaluating Website Fingerprinting Attacks on {Tor} in the Real World}, + author={Cherubin, Giovanni and Jansen, Rob and Troncoso, Carmela}, + booktitle={{USENIX Security}}, + year={2022} +} diff --git a/summary/src/tlwo/src/related.tex b/summary/src/tlwo/src/related.tex new file mode 100644 index 0000000..3521d45 --- /dev/null +++ b/summary/src/tlwo/src/related.tex @@ -0,0 +1,174 @@ +\section{Related 
Work} \label{tlwo:sec:related} + +Van Goethem \emph{et~al.}~\cite{timeless} originally proposed timeless timing +attacks, showing significant improvements against HTTP/2 web servers, Tor onion +services, and EAP-pwd. All timeless timing attacks exploit concurrent +processing, e.g., in HTTP/2, by filling buffers at the relay closest to an onion +service, or packing two authentication requests in EAP-pwd into the same RadSec +(TLS over TCP) packet. The latter was the inspiration for our timeless timing +attack on Tor's DNS cache, i.e., packing two RESOLVE cells into a single TLS +record. + +There has been a long body of work on how to safely perform measurements of the +Tor network~\cite{privex,FenskeMJS17,privcount,histor}, laying the foundation +for safely performing large-scale measurements~\cite{JansenTH18,ManiWJJS18}. Our +timeless timing attack enables anyone to do network-wide measurements for exact +domains on specific exits with a precision of at least one second. This is +highly invasive and a useful resource to deanonymize Tor-users, discussed +further shortly. Our network measurements to inform the design of defenses have +been focused around the DNS in Tor. Similar to other related work (see below), +we focused on how to make those limited measurements safe; not on how to broadly +perform a much wider range of measurements safely. + +%% ATTACK +Greschbach \emph{et~al.}~\cite{GreschbachPRWF17} investigated the effect of DNS on +Tor's anonymity. They quantified the use of DNS resolvers in the network, the +impact of choice of resolver on correlation and confirmation attacks, and how to +incorporate observed DNS traffic with website fingerprinting +attacks~\cite{cheng1998traffic,HerrmannWF09,Hintz02,DBLP:conf/ccs/LiberatoreL06,PanchenkoNZE11,DBLP:conf/sp/SunSWRPQ02} +to make improved correlation attacks. In their construction, DNS traffic is used +to either reduce the number of websites to consider during classification or to +confirm classification. 
A key observation was that Tor, due to a bug, clipped
+all TTLs to 60 seconds. This was resolved and led to the current approach of
+clipping to 300 or 3,600 seconds. One of our short-term mitigations updates
+these clips to be fuzzy.
+
+Greschbach \emph{et~al.}~\cite{GreschbachPRWF17} also measured DNS requests from
+an exit for both web and a more permissive exit policy in 2016. The collection
+was done by observing DNS requests to the exit's resolver and aggregating
+results into five-minute buckets. Similarly, we aggregate over time in 15-minute
+buckets and do not directly collect resolved domains. They found a small
+difference between exit policies, with the permissive exit having slightly fewer
+(3\% smaller median) lookups. Our results are very different: the permissive
+exit policy resulted in significantly more (double the median) lookups.
+
+Pulls and Dahlberg~\cite{wfwo} generalized the classifier confirmation attack of
+Greschbach \emph{et~al.}~\cite{GreschbachPRWF17} into a new security notion for website
+fingerprinting attacks, and further explored the use of DNS. They showed that
+timing attacks were possible in Tor's DNS cache, performing network-wide
+measurements on a domain under their control with a true positive
+rate of 17.3\% when attempting to minimize false positives. We use a similar
+method for measurements, but our attack is significantly better with a
+100\% true positive rate and no false positives at all.
+
+Sonntag collected hourly data from the resolver of an exit during five months in
+2018~\cite{sonntag-metrics,Sonntag19}. In terms of frequency, they noted about
+18.5 requests per second, with a peak of 291,472 requests in an hour. The
+average is higher than ours (3.9 and 7.8 requests per second) while the peak is
+significantly smaller (1,183,275 requests in 15 minutes). Sonntag also analyzed
+the actual domains looked up, including categorization (porn, social network,
+shopping, advertisement, etc.).
We do not collect domains; only cache-hits as part +of popularity lists by aggregating domains into buckets like top-100, top-1k, +etc. + +Mani \emph{et~al.}~\cite{ManiWJJS18} used PrivCount~\cite{privcount} and +PSC~\cite{FenskeMJS17} to safely make extensive network-wide measurements of the +Tor network. They measured, e.g., circuits, streams, destination ports, and exit +domains at exits, as well as client connections, churn, composition, and +diversity at clients. Their exit probability ranged between 1.5--2.2\%, compared +to our peak of 0.1\%. While our data is much more limited and targeted around +DNS, there are two interesting comparisons to consider: +\begin{itemize} + % mean lookups: 17529.813 and 41099.697 + % 15 minutes x4 = 1h, 24h per day + % mean exit probabilities: 0.041% and 0.049% + % (17529.813+41099.697)×4×24÷(0.01×(0.041+0.049)) = 6,253,814,400 + + \item Mani \emph{et~al.} observed 2.1 billion exit streams inferred in the + network every 24 hours. Extrapolating on our lookup statistics we have an + average of 6.3 billion lookups, which corresponds to the number of + streams.\footnote{Streams either do a lookup with RELAY\_BEGIN or are closed + after a RELAY\_RESOLVE cell. Timeout and retries are possible on resolver + failure, but the way we measure hides those extra lookups.} This suggests a + significant increase ($\approx3x$) in the number of streams in the Tor network since 2018. + \item Mani \emph{et~al.} measured the frequency of how well the primary + domain on a circuit matched the Alexa top-one-million list. We transform + their reported relative counts and compare it to the relative count of + average lookups in the same intervals in our dataset for top-10k, shown in + Figure~\ref{tlwo:fig:mani:popularity}. Note that this only uses data from phase + one of our data collection. Broadly, we see that their results show + significantly more traffic to top-10 than any of the lists we use. 
That
+  said, our data supports one of Mani \emph{et~al.}'s conclusions that the
+  popularity lists are reasonably accurate representations of traffic from the
+  Tor network.
+\end{itemize}
+
+\begin{figure}[!t]
+  \centering
+  \subfloat[][web]{%
+    \includegraphics[width=.67\columnwidth]{src/tlwo/img/plot_popularity_match-web.pdf}
+    \label{tlwo:fig:mani:popularity:web}
+  }\\
+  \subfloat[][permissive]{%
+    \includegraphics[width=.67\columnwidth]{src/tlwo/img/plot_popularity_match-permissive.pdf}
+    \label{tlwo:fig:mani:popularity:perm}
+  }
+  \caption{%
+  Comparison of \emph{relative popularity} of popularity rankings with the
+  results of Mani \emph{et~al.}~\cite{ManiWJJS18}.}
+  \label{tlwo:fig:mani:popularity}
+\end{figure}
+
+%% DNS prefetching paragraphs and software from there
+% Defenses can expand on DNS prefetching
+% https://datatracker.ietf.org/doc/html/rfc8767
+% https://unbound.docs.nlnetlabs.nl/en/latest/topics/serve-stale.html
+
+The relatively recent RFC 8767~\cite{rfc8767} allows for DNS data to be served
+``stale'', i.e., after expiry according to its TTL, in the exceptional
+circumstance that a recursive resolver is unable to refresh the information. In
+case data goes stale, RFC 8767 suggests serving it for at most one to three days.
+The background of RFC 8767 aptly motivates this with the saying that
+\emph{``stale bread is better than no bread''}. In addition to serving
+potentially stale data, modern resolvers like Unbound~\cite{unbound} further
+support prefetching: preemptively refreshing domains in the cache before TTL
+expiry. These measures all serve to improve reliability and have been found to
+be used for the sake of resiliency~\cite{MouraHMSD18}. Tor already clips TTLs, in a
+sense serving stale data for the vast majority of domains. Our preload design
+takes this further by introducing continuous prefetching of domains on a fixed
+allowlist.
+ +Two decades ago, Jung \emph{et~al.}~\cite{jung} found that cache-hit ratios on +the order of 80--87\% are achieved if a resolver has ten or more clients and +TTLs are at least ten minutes. More recently Hao and Wang~\cite{hao-and-wang} +reported that 100k cached entries are required to achieve a baseline of 86\% +cache-hits for a first-come first-serve cache in a university network. Their +dataset had similar characteristics to a DNS trace collected for an ISP resolver +by Chen \emph{et~al.}~\cite{chen} with regards to \emph{disposable domains} that +are never requested more than once in the long-tail of DNS; out of the 11\% of +domains that are not disposable, 5\% and 30\% of them have cache-hit ratios of +at least 95\% and 80\% respectively. It appears that fewer disposable domains +are resolved in Tor because the observed cache sizes are not large enough for +89\% unique lookups. Achieving an 80\% cache-hit ratio with a cache of 10k +entries does not seem to be an outlier. + +% Chen et al. (2014), DNS noise: Measuring the pervasiveness of disposable domains in modern DNS traffic +% - ISP data set from end of 2011 +% - Fig 3, long-tail of domains with no additional look-ups (89%) +% - Fig 4, cache hit distribution +% - Fig 7, 30% of all non-unique domain names have at least 80% cache hit ratio +% and 5% have at least 95% cache-hit ratios. +% +% Hao and Wang. (2017) Exploring Domain Name Based Features on the Effectiveness of DNS Caching +% - University data set +% - "we observe that a size of 100,000 entries would have a cache hit rate of about 86%" +% - (context is to use this baseline for other policies than fifo/lru etc.) +% +% Avoiding to cache "disposable domains", i.e., domains that are only used +% once seems to be a thing to research. This is what Hao and Wang does. +% +% Jung et al. 
(2002) DNS Performance and the Effectiveness of Caching +% - MIT network 80-86% cache hits, much because multiple TCP connections from +% browsers +% - "We find that the distribution of names is Zipf-like, which immeaditely +% limits even the theoretical effectivness of caching" +% - 10 min TTL yields almost the same cache-hit ratio as a longer TTL +% - Already with 10 clients, same cache-hit ratio as 1000 clients +% +% https://developers.google.com/speed/public-dns/docs/performance +% - "While a few sites (and consequently DNS names) are very popular, most are of +% interest to only a few users and are accessed rarely; so the majority of +% requests result in cache misses" +% - (Could not find anything about Google/Cloudflare cache-hit ratios for DNS) +% - (Found a white paper saying 70-90%, but not a good source) +% - https://www.senki.org/wp-content/uploads/2017/08/213049567-How-to-Measure-Resolver-Performance.pdf diff --git a/summary/src/tlwo/src/short.tex b/summary/src/tlwo/src/short.tex new file mode 100644 index 0000000..6d06c86 --- /dev/null +++ b/summary/src/tlwo/src/short.tex @@ -0,0 +1,63 @@ +\section{Mitigations} \label{tlwo:sec:short} + +Until a more comprehensive defense can be deployed we propose two short-term +mitigations that require little (fuzzy TTLs) or no (cover lookups) changes to +Tor. The former adds some uncertainty with regards to when a domain was added +to an exit's DNS cache. The latter can remove or reduce the attacker's ability +to conduct attacks against specific domains but is limited in its scalability. + +\subsection{Fuzzy TTLs} \label{tlwo:sec:short:fuzzy} +Recall that it is possible to determine when a domain was inserted into an +exit's DNS cache (Section~\ref{tlwo:sec:attack:detailed}) once you know the time $t$ +when the timeless timing attack started, the duration until the domain was no +longer cached $x$ (repeated probes), and the expected clip value +$\mathsf{clipped\_ttl}$ of the domain. 
The idea of fuzzy TTLs is to add +uncertainty by randomizing the length of time that an entry is cached. + +In more detail, keep Tor's DNS cache as-is but sample the cache duration +uniformly at random from $[m, \mathsf{clipped\_ttl}]$, where $m$ is the minimum +duration to cache. Upon observing the exact time of removal $t+x$, the attacker +now learns that the domain has been in the cache for the duration $x$ and was +thus cached between $[t+x-\mathsf{clipped\_ttl}, t-m]$. Note that if $m = +\mathsf{clipped\_ttl}$, then $x = 0$; the same as in Tor today. + +The reality of websites is unfortunately that they consist of multiple domains, +reducing the effectiveness of fuzzy TTLs because the attacker uses the most +lucky sample. For a list of domains $d_1,\hdots,d_k$ that were added at the same +time with identical clips, then $x \leftarrow \mathsf{max}(x_1,\hdots,x_k)$. +Based on our preload list measurements presented in +Section~\ref{tlwo:sec:long:preload}, we expect around 8--13 domains per site +available for an attacker to potentially query for. Earlier work found +a median of two unique domains out of ten domains in total per website on Alexa +top 1M~\cite{GreschbachPRWF17}. + +Fuzzy TTLs are an ineffective mitigation if the attacker just wants to confirm +suspected activity with a low base rate, i.e., the mere existence of cached +domains anywhere in the network is enough of a signal~\cite{wfwo}. Fuzzy TTLs +are a plus for websites that are modestly popular in the network, since the +attacker has to determine which of several exits with cached domains is the +correct one. Having to consider multiple domains and exits (to narrow down the +exact time) is more noisy in the network and increases the risk of +detection~\cite{AumannL07}. Attackers may be forced to consider a time-window of +several seconds or even minutes, which is a big win for defending against +correlation and confirmation attacks~\cite{GreschbachPRWF17,wfwo}. 
+ +\subsection{Cover Lookups} +The idea of the cover lookups mitigation is to simply inject domains into DNS +caches in the Tor network to create false positives. Injected domains must be +indistinguishable from domains cached from real Tor user activity. For this, a +distribution that models website visits for a particular base rate should be +used rather than running, e.g., a deterministic cron job. Further, care has to +be taken to capture all predictable domains for each website to defend. + +A more drastic mitigation would be to keep a site's domains cached at every exit +all the time, e.g., by running \texttt{exitmap}~\cite{exitmap} every five +minutes. This obviously scales poorly. The network overhead would already be +significant for a few hundred sites, e.g., estimates based on Alexa top-1k would +add about 26.7~requests per second to each exit. + +Cover lookups do not scale, even if just injected at few exits probabilistically +according to some target base rate. It is a last resort mitigation for site +operators that fear that their users are targeted by motivated attackers and +where, for some reason, the site cannot transition to being completely (no +resources loaded from other domains) hosted as an onion service. diff --git a/summary/src/tlwo/src/tor-cache.tex b/summary/src/tlwo/src/tor-cache.tex new file mode 100644 index 0000000..392b755 --- /dev/null +++ b/summary/src/tlwo/src/tor-cache.tex @@ -0,0 +1,167 @@ +\section{Tor's DNS Cache Today} \label{tlwo:sec:torcache} + +To better understand the DNS cache of Tor today, we set out to collect +performance metrics from exits in the live Tor network. +Section~\ref{tlwo:sec:torcache:ethics} covers ethical considerations, followed by +data collection in Section~\ref{tlwo:sec:torcache:collection} and resulting metrics +in Section~\ref{tlwo:sec:torcache:metrics}. 
+ +\subsection{Ethical Considerations} \label{tlwo:sec:torcache:ethics} + +We submitted a proposal to the Tor Research Safety Board describing measurements +that would ultimately inform the design of a long-term defense +(Section~\ref{tlwo:sec:long}) against our improved attack (Section~\ref{tlwo:sec:attack}). +To be able to assess the impact of the defense we needed to better understand +the DNS cache Tor has today as a baseline. After a couple of iterations with the +Tor Research Safety Board we reached consensus, and then successfully completed +our university's ethical review process. The proposal also included measurements +needed for our defense, described later in +Section~\ref{tlwo:sec:long:preload:collection}. During the measurement period of +four months we consulted the Tor Research Safety Board to discuss our results. + +The intuition of our measurement is as follows. Two exit relays are operated to +collect counters related to domain lookups. For example, the number of lookups +and cache hits (Section~\ref{tlwo:sec:torcache:collection}). These counters are the +result of all traffic at the exit, aggregated over 15-minute intervals before +being written to disk and then reset in memory. Based on an exit probability of +about $0.0005$ ($\approx 100$Mbit/s), we extrapolated from the measurements of +Mani \emph{et~al.}~\cite{ManiWJJS18} that we should expect about 725 website +visits during 15 minutes. Each website visit typically triggers multiple domain +lookups~\cite{GreschbachPRWF17} that affect our global counters. A collection +interval of 15 minutes should thus aggregate hundreds of website visits for a +small fraction of the network, making the resulting dataset hardly useful for an +attacker performing correlation or confirmation attacks on the network. +This sketch appears to be confirmed by our measurement results: out of 23,632 +15-minute intervals, only 18 contained fewer than 1,000 lookups. 
+Our conclusion together with the Tor Research Safety Board was that the +resulting dataset should be safe to make public (further discussed later). + +\subsection{Data Collection} \label{tlwo:sec:torcache:collection} + +Two 100 Mbit/s exit relays were operated by volunteers on the premises of +DFRI\footnote{More information about DFRI can be found at their website: +\url{https://www.dfri.se}.} +from May 2 until September 3, 2022. One exit was configured in its exit policy +with \emph{web} ports.\footnote{% + Reject all ports except 80 and 443. (The exit can still do DNS for users.) +} The other relay was configured with +\emph{permissive} ports to also allow non-web traffic.\footnote{% + Allow all ports except 25, 119, 135--139, 445, 563, 1214, 4661--4666, + 6346--6429, 6699, and 6881--6999. +} Otherwise the two exits were identical, +running on the same VM with a dedicated \texttt{unbound} process that had +caching disabled by setting the \texttt{rrset-cache-size} to zero (to avoid TTL +biases). We collected the following counters every 15 minutes at both exits: + +\begin{description} + \item[timestamp] UNIX timestamp when the data was collected. + \item[lookups] Total number of observed domain lookups. + \item[hits\_5m] Number of cache hits with a TTL of 300 seconds. + \item[hits\_60m] Number of cache hits with a TTL of 3,600 seconds. + \item[hits\_pending] Number of cache hits with a pending resolve, i.e., + an answer has been requested but is not yet available. + \item[hits\_same\_circuit] Number of streams that looked up a domain + that was previously looked up on the same circuit. + \item[num\_cache\_entries] Number of entries in Tor's DNS cache. +\end{description} + +A timestamp is needed to plot metrics as a function of time. Timestamps are +also crucial for the additional counters described in +Section~\ref{tlwo:sec:long:preload:collection}. The number of lookups and different +types of cache hits are needed to get a baseline of cache-hit ratios. 
The +number of entries in Tor's DNS cache (at the time of collection) is needed to +get a baseline of memory usage. The necessary Tor changes to collect all metrics +(including Section~\ref{tlwo:sec:long:preload:collection}) were relatively modest: +400 lines of code. + +\subsection{Metrics} \label{tlwo:sec:torcache:metrics} + +Regarding lookups per 15 minutes, the web exit processed a mean of 17,530 and +median of 13,393 lookups (Figure~\ref{tlwo:fig:metrics:lookweb}), and the permissive +exit processed a mean of 41,100 and median of 26,940 lookups +(Figure~\ref{tlwo:fig:metrics:lookperm}). The permissive exit policy results in +significantly more lookups. Around August 1, our exits experienced downtime, +visible as dips in lookups in both figures (at times fewer than 1,000 lookups, +as noted in Section~\ref{tlwo:sec:torcache:ethics}). Exit probability is weakly +correlated with lookups: Pearson correlation 0.30 (web) and 0.16 (permissive). + +\begin{figure}[!t] + \centering + \subfloat[][web]{% + \includegraphics[width=.5\columnwidth]{src/tlwo/img/plot_lookups-web.pdf} + \label{tlwo:fig:metrics:lookweb} + } + \subfloat[][permissive]{% + \includegraphics[width=.5\columnwidth]{src/tlwo/img/plot_lookups-permissive.pdf} + \label{tlwo:fig:metrics:lookperm} + } + \caption{Lookups every 15 minutes and exit probability.} + \label{tlwo:fig:lookups} +\end{figure} + +Figures~\ref{tlwo:fig:metrics:cacheweb} and~\ref{tlwo:fig:metrics:cacheperm} show the +number of entries in Tor's DNS cache. The web exit has a mean of 7,672 and +median of 7,325 entries, and the permissive exit a mean of 12,130 and median of +11,408 entries. Both appear relatively stable compared to the number of lookups +(note log-scale y-axis in Figure~\ref{tlwo:fig:lookups}). Likely, this is +because traffic on the Tor network is not uniformly distributed, but rather +concentrated to relatively few destinations, e.g., as shown with website +popularity~\cite{ManiWJJS18}. 
+ +\begin{figure}[!t] + \centering + \subfloat[][web]{% + \includegraphics[width=.5\columnwidth]{src/tlwo/img/plot_cache_entries-web.pdf} + \label{tlwo:fig:metrics:cacheweb} + } + \subfloat[][permissive]{% + \includegraphics[width=.5\columnwidth]{src/tlwo/img/plot_cache_entries-permissive.pdf} + \label{tlwo:fig:metrics:cacheperm} + } + \caption{Cache entries every 15 minutes.} + \label{tlwo:fig:cache} +\end{figure} + +Central to a DNS cache is its \emph{cache-hit} ratio: how often lookups can be +resolved using cached entries instead of asking DNS resolvers. +Figures~\ref{tlwo:fig:metrics:hitsweb} and~\ref{tlwo:fig:metrics:hitsperm} show the +cache-hit ratios for the two exits, with a mean cache-hit ratio of 0.80 (web) +and 0.83 (permissive). We also show if the cache hits occurred due to a cache +entry used earlier on the same circuit (``same'') or from another circuit +(``shared''). Further, over all the cache hits, we show if the hits were because +of DNS entries with a five-minute cached TTL (``5min''), a 60-minute cached TTL +(``60min''), or pending entries in the DNS cache (``pending''). Same circuit +hits are likely due to Tor Browser improving performance by creating multiple +streams to the same destination. The cross-circuit cache-hit ratio is much +smaller (``shared'') with a mean of 0.11 (web) and 0.17 (permissive). We return +to these ratios in Section~\ref{tlwo:sec:long:evaluation} to compare with our +defense. + +\begin{figure}[!t] + \centering + \subfloat[][web]{% + \includegraphics[width=.67\columnwidth]{src/tlwo/img/plot_cache_hits-web.pdf} + \label{tlwo:fig:metrics:hitsweb} + }\\ + \subfloat[][permissive]{% + \includegraphics[width=.67\columnwidth]{src/tlwo/img/plot_cache_hits-permissive.pdf} + \label{tlwo:fig:metrics:hitsperm} + } + \caption{% + Cache-hit ratio every 15 minutes. The total ratio can be split + by same+shared hits or 60min+5min+pending hits. 
+ } + \label{tlwo:fig:hits} +\end{figure} + +During the four months of measurements, our exits experienced sporadic downtime +(early August) and the Tor network endured significant network DDoS +activities~\cite{network-ddos}. This shows in our data, e.g., with the drop to +close to zero lookups in Figure~\ref{tlwo:fig:lookups}, huge spikes of cached entries +in Figure~\ref{tlwo:fig:cache}, and periods where the cache-hit ratio was almost +one in Figure~\ref{tlwo:fig:hits}. + +To summarize, Tor's DNS cache has a cache-hit ratio over 80\% using a modestly +sized DNS cache. About 11--17\% of these hits are due to sharing the cache +across circuits. The number of lookups is weakly correlated with exit +probability. -- cgit v1.2.3