From 385cc92bc91e1a6c3724085c060e76bf40c13ed3 Mon Sep 17 00:00:00 2001 From: Rasmus Dahlberg Date: Tue, 15 Oct 2024 16:08:16 +0200 Subject: Import PhD thesis --- summary/src/cat/.gitignore | 9 + summary/src/cat/img/aws.pdf | Bin 0 -> 150359 bytes summary/src/cat/img/bt10.pdf | Bin 0 -> 154201 bytes summary/src/cat/img/bt100.pdf | Bin 0 -> 157181 bytes summary/src/cat/img/bt1000.pdf | Bin 0 -> 148806 bytes summary/src/cat/img/df_nodef.pdf | Bin 0 -> 118767 bytes summary/src/cat/img/df_wt.pdf | Bin 0 -> 105303 bytes summary/src/cat/img/df_wtfpad.pdf | Bin 0 -> 126269 bytes summary/src/cat/img/dns__classifier-idea.pdf | Bin 0 -> 86270 bytes summary/src/cat/img/dns__timing-dist.pdf | Bin 0 -> 98297 bytes summary/src/cat/img/dynaflow_config1.pdf | Bin 0 -> 115968 bytes summary/src/cat/img/dynaflow_config2.pdf | Bin 0 -> 127080 bytes summary/src/cat/img/dynaflow_nodef.pdf | Bin 0 -> 123795 bytes summary/src/cat/img/factor-fnp.pdf | Bin 0 -> 123720 bytes summary/src/cat/img/factor-recall.pdf | Bin 0 -> 145211 bytes summary/src/cat/img/probfp.pdf | Bin 0 -> 132759 bytes summary/src/cat/img/setting-oracle.pdf | Bin 0 -> 1316415 bytes summary/src/cat/img/setting.pdf | Bin 0 -> 1021741 bytes summary/src/cat/img/timeuntilvisited.pdf | Bin 0 -> 84191 bytes summary/src/cat/img/wang_csbuflo.pdf | Bin 0 -> 111918 bytes summary/src/cat/img/wang_nodef.pdf | Bin 0 -> 118619 bytes summary/src/cat/img/wang_tamaraw.pdf | Bin 0 -> 116397 bytes summary/src/cat/main.tex | 70 +++ summary/src/cat/src/abstract.tex | 25 + summary/src/cat/src/ack.tex | 9 + summary/src/cat/src/background.tex | 211 +++++++ summary/src/cat/src/bayes.tex | 96 +++ summary/src/cat/src/conclusions.tex | 25 + summary/src/cat/src/discussion.tex | 153 +++++ summary/src/cat/src/intro.tex | 122 ++++ summary/src/cat/src/lessons.tex | 47 ++ summary/src/cat/src/main.tex | 121 ++++ summary/src/cat/src/oracles.tex | 126 ++++ summary/src/cat/src/othersources.tex | 112 ++++ summary/src/cat/src/ref-min.bib | 837 +++++++++++++++++++++++++++ summary/src/cat/src/related.tex | 64 ++ summary/src/cat/src/sim.tex | 131 +++++ summary/src/cat/src/sources.tex | 204 +++++++ summary/src/cat/src/wf.tex | 181 ++++++ 39 files changed, 2543 insertions(+) create mode 100644 summary/src/cat/.gitignore create mode 100644 summary/src/cat/img/aws.pdf create mode 100644 summary/src/cat/img/bt10.pdf create mode 100644 summary/src/cat/img/bt100.pdf create mode 100644 summary/src/cat/img/bt1000.pdf create mode 100644 summary/src/cat/img/df_nodef.pdf create mode 100644 summary/src/cat/img/df_wt.pdf create mode 100644 summary/src/cat/img/df_wtfpad.pdf create mode 100644 summary/src/cat/img/dns__classifier-idea.pdf create mode 100644 summary/src/cat/img/dns__timing-dist.pdf create mode 100644 summary/src/cat/img/dynaflow_config1.pdf create mode 100644 summary/src/cat/img/dynaflow_config2.pdf create mode 100644 summary/src/cat/img/dynaflow_nodef.pdf create mode 100644 summary/src/cat/img/factor-fnp.pdf create mode 100644 summary/src/cat/img/factor-recall.pdf create mode 100644 summary/src/cat/img/probfp.pdf create mode 100644 summary/src/cat/img/setting-oracle.pdf create mode 100644 summary/src/cat/img/setting.pdf create mode 100644 summary/src/cat/img/timeuntilvisited.pdf create mode 100644 summary/src/cat/img/wang_csbuflo.pdf create mode 100644 summary/src/cat/img/wang_nodef.pdf create mode 100644 summary/src/cat/img/wang_tamaraw.pdf create mode 100644 summary/src/cat/main.tex create mode 100644 summary/src/cat/src/abstract.tex create mode 100644 summary/src/cat/src/ack.tex create mode 100644 summary/src/cat/src/background.tex create mode 100644 summary/src/cat/src/bayes.tex create mode 100644 summary/src/cat/src/conclusions.tex create mode 100644 summary/src/cat/src/discussion.tex create mode 100644 summary/src/cat/src/intro.tex create mode 100644 summary/src/cat/src/lessons.tex create mode 100644 summary/src/cat/src/main.tex create mode 100644 summary/src/cat/src/oracles.tex create mode 100644 summary/src/cat/src/othersources.tex create mode 100644 summary/src/cat/src/ref-min.bib create mode 100644 summary/src/cat/src/related.tex create mode 100644 summary/src/cat/src/sim.tex create mode 100644 summary/src/cat/src/sources.tex create mode 100644 summary/src/cat/src/wf.tex (limited to 'summary/src/cat') diff --git a/summary/src/cat/.gitignore b/summary/src/cat/.gitignore new file mode 100644 index 0000000..8bb88c8 --- /dev/null +++ b/summary/src/cat/.gitignore @@ -0,0 +1,9 @@ +main.pdf +*.blg +*.bbl +*.fls +*.fdb_latexmk +*.log +*.out +*.aux +*.swp diff --git a/summary/src/cat/img/aws.pdf b/summary/src/cat/img/aws.pdf new file mode 100644 index 0000000..0c3161e Binary files /dev/null and b/summary/src/cat/img/aws.pdf differ diff --git a/summary/src/cat/img/bt10.pdf b/summary/src/cat/img/bt10.pdf new file mode 100644 index 0000000..23fbd5c Binary files /dev/null and b/summary/src/cat/img/bt10.pdf differ diff --git a/summary/src/cat/img/bt100.pdf b/summary/src/cat/img/bt100.pdf new file mode 100644 index 0000000..b37c523 Binary files /dev/null and b/summary/src/cat/img/bt100.pdf differ diff --git a/summary/src/cat/img/bt1000.pdf b/summary/src/cat/img/bt1000.pdf new file mode 100644 index 0000000..65e51f8 Binary files /dev/null and b/summary/src/cat/img/bt1000.pdf differ diff --git a/summary/src/cat/img/df_nodef.pdf b/summary/src/cat/img/df_nodef.pdf new file mode 100644 index 0000000..f72a9d0 Binary files /dev/null and b/summary/src/cat/img/df_nodef.pdf differ diff --git a/summary/src/cat/img/df_wt.pdf b/summary/src/cat/img/df_wt.pdf new file mode 100644 index 0000000..14a18b0 Binary files /dev/null and b/summary/src/cat/img/df_wt.pdf differ diff --git a/summary/src/cat/img/df_wtfpad.pdf b/summary/src/cat/img/df_wtfpad.pdf new file mode 100644 index 0000000..e425ae8 Binary files /dev/null and b/summary/src/cat/img/df_wtfpad.pdf differ diff --git a/summary/src/cat/img/dns__classifier-idea.pdf b/summary/src/cat/img/dns__classifier-idea.pdf new file mode 100644 index 0000000..bf68fff Binary files /dev/null and b/summary/src/cat/img/dns__classifier-idea.pdf differ diff --git a/summary/src/cat/img/dns__timing-dist.pdf b/summary/src/cat/img/dns__timing-dist.pdf new file mode 100644 index 0000000..eca8a0a Binary files /dev/null and b/summary/src/cat/img/dns__timing-dist.pdf differ diff --git a/summary/src/cat/img/dynaflow_config1.pdf b/summary/src/cat/img/dynaflow_config1.pdf new file mode 100644 index 0000000..de82dc4 Binary files /dev/null and b/summary/src/cat/img/dynaflow_config1.pdf differ diff --git a/summary/src/cat/img/dynaflow_config2.pdf b/summary/src/cat/img/dynaflow_config2.pdf new file mode 100644 index 0000000..e995173 Binary files /dev/null and b/summary/src/cat/img/dynaflow_config2.pdf differ diff --git a/summary/src/cat/img/dynaflow_nodef.pdf b/summary/src/cat/img/dynaflow_nodef.pdf new file mode 100644 index 0000000..c481a3d Binary files /dev/null and b/summary/src/cat/img/dynaflow_nodef.pdf differ diff --git a/summary/src/cat/img/factor-fnp.pdf b/summary/src/cat/img/factor-fnp.pdf new file mode 100644 index 0000000..933d1ce Binary files /dev/null and b/summary/src/cat/img/factor-fnp.pdf differ diff --git a/summary/src/cat/img/factor-recall.pdf b/summary/src/cat/img/factor-recall.pdf new file mode 100644 index 0000000..2e017db Binary files /dev/null and b/summary/src/cat/img/factor-recall.pdf differ diff --git a/summary/src/cat/img/probfp.pdf b/summary/src/cat/img/probfp.pdf new file mode 100644 index 0000000..81e8de8 Binary files /dev/null and b/summary/src/cat/img/probfp.pdf differ diff --git a/summary/src/cat/img/setting-oracle.pdf b/summary/src/cat/img/setting-oracle.pdf new file mode 100644 index 0000000..4620d67 Binary files /dev/null and b/summary/src/cat/img/setting-oracle.pdf differ diff --git a/summary/src/cat/img/setting.pdf b/summary/src/cat/img/setting.pdf new file mode 100644 index 0000000..1004bf1 Binary files /dev/null and b/summary/src/cat/img/setting.pdf differ diff --git a/summary/src/cat/img/timeuntilvisited.pdf b/summary/src/cat/img/timeuntilvisited.pdf new file mode 100644 index 0000000..188746b Binary files /dev/null and b/summary/src/cat/img/timeuntilvisited.pdf differ diff --git a/summary/src/cat/img/wang_csbuflo.pdf b/summary/src/cat/img/wang_csbuflo.pdf new file mode 100644 index 0000000..0ec042b Binary files /dev/null and b/summary/src/cat/img/wang_csbuflo.pdf differ diff --git a/summary/src/cat/img/wang_nodef.pdf b/summary/src/cat/img/wang_nodef.pdf new file mode 100644 index 0000000..7dd0023 Binary files /dev/null and b/summary/src/cat/img/wang_nodef.pdf differ diff --git a/summary/src/cat/img/wang_tamaraw.pdf b/summary/src/cat/img/wang_tamaraw.pdf new file mode 100644 index 0000000..7f902ff Binary files /dev/null and b/summary/src/cat/img/wang_tamaraw.pdf differ diff --git a/summary/src/cat/main.tex b/summary/src/cat/main.tex new file mode 100644 index 0000000..5dd9d84 --- /dev/null +++ b/summary/src/cat/main.tex @@ -0,0 +1,70 @@ +\begin{kaupaper}[ + author={% + Tobias Pulls and \textbf{Rasmus Dahlberg} + }, + title={% + Website Fingerprinting with Website Oracles + }, + reference={% + PETS (2020) + }, + summary={% + One of the properties Tor aims to provide against local network attackers + is unlinkability between end-users (sender anonymity set) and their + destinations on the Internet (receiver anonymity set). A website + fingerprinting attack aims to break anonymity in this model by inferring + which website an identifiable end-user is visiting based only on the + traffic entering the Tor network. We extend the attacker model for + website fingerprinting attacks by introducing the notion of \emph{website + oracles}. A website oracle answers the following question: was website $w$ + visited during time frame $t$? In other words, the attacker can query the + receiver anonymity set for websites that were (not) visited. Our + simulations show that augmenting past website fingerprinting attacks to + include website oracles significantly reduces false positives for all but + the most popular websites, e.g., to the order of $10^{-6}$ for + classifications around Alexa top-10k and much less for the long tail of + sites. Further, some earlier website fingerprinting defenses are largely + ineffective in the (stronger) attacker model that includes website + oracles. We discuss a dozen real-world website oracles ranging from + centralized access logs to widely accessible real-time bidding platforms + and DNS caches, arguing that they are inherent parts of the protocols used + to perform website visits. Therefore, access to a website oracle should + be an assumed attacker capability when evaluating which website + fingerprinting defenses are effective. + }, + participation={\vspace{-.25cm} + Tobias is the main author and conducted most of the work. I mainly + contributed by coining the name \emph{website oracle}, evaluating + sources of real-world website oracles, and performing our non-simulated + network experiments. + }, + label={ + paper:cat + }, +] + \maketitle + \begin{abstract} + \input{src/cat/src/abstract} + \end{abstract} + + \input{src/cat/src/intro} + \input{src/cat/src/background} + \input{src/cat/src/oracles} + \input{src/cat/src/sources} + \input{src/cat/src/sim} + \input{src/cat/src/wf} + \input{src/cat/src/discussion} + \input{src/cat/src/related} + \input{src/cat/src/conclusions} + \input{src/cat/src/ack} + + \bibliographystyle{plain} + \bibliography{src/cat/src/ref-min} + + \begin{appendices} + \input{src/cat/src/bayes} + \input{src/cat/src/lessons} + \input{src/cat/src/othersources} + \end{appendices} + +\end{kaupaper} diff --git a/summary/src/cat/src/abstract.tex b/summary/src/cat/src/abstract.tex new file mode 100644 index 0000000..da09599 --- /dev/null +++ b/summary/src/cat/src/abstract.tex @@ -0,0 +1,25 @@ +\noindent +Website Fingerprinting (WF) attacks are a subset of traffic analysis attacks +where a local passive attacker attempts to infer which websites a target victim +is visiting over an encrypted tunnel, such as the anonymity network Tor. We +introduce the security notion of a \emph{Website Oracle} (WO) that gives a WF +attacker the capability to determine whether a particular monitored website was +among the websites visited by Tor clients at the time of a victim's trace. Our +simulations show that combining a WO with a WF attack---which we refer to as a +WF+WO attack---significantly reduces false positives for about half of all +website visits and for the vast majority of websites visited over Tor. The +measured false positive rate is on the order one false positive per million +classified website trace for websites around Alexa rank 10,000. Less popular +monitored websites show orders of magnitude lower false positive rates. + +{\setlength{\parindent}{6mm} We argue that WOs are inherent to the setting of +anonymity networks and should be an assumed capability of attackers when +assessing WF attacks and defenses. Sources of WOs are abundant and available to +a wide range of realistic attackers, e.g., due to the use of DNS, OCSP, and +real-time bidding for online advertisement on the Internet, as well as the +abundance of middleboxes and access logs. Access to a WO indicates that the +evaluation of WF defenses in the open world should focus on the highest possible +recall an attacker can achieve. Our simulations show that augmenting the Deep +Fingerprinting WF attack by Sirinam \emph{et~al.}~\cite{DF} with access to a WO +significantly improves the attack against five state-of-the-art WF defenses, +rendering some of them largely ineffective in this new WF+WO setting.} diff --git a/summary/src/cat/src/ack.tex b/summary/src/cat/src/ack.tex new file mode 100644 index 0000000..4008faf --- /dev/null +++ b/summary/src/cat/src/ack.tex @@ -0,0 +1,9 @@ +\section*{Acknowledgements} +We would like to thank Jari Appelgren, Roger Dingledine, Nicholas Hopper, Marc +Juarez, George Kadianakis, Linus Nordberg, Mike Perry, Erik Wästlund, and the +PETS reviewers for their valuable feedback. Simulations were performed using the +Swedish National Infrastructure for Computing (SNIC) at +High Performance Computing Center North (HPC2N) +This research was funded by the +Swedish Internet Foundation and the +Knowledge Foundation of Sweden. diff --git a/summary/src/cat/src/background.tex b/summary/src/cat/src/background.tex new file mode 100644 index 0000000..bb64337 --- /dev/null +++ b/summary/src/cat/src/background.tex @@ -0,0 +1,211 @@ +\section{Background} \label{cat:sec:back} +Here we present background on terminology, the anonymity network Tor, +and WF attacks and defenses. + +\subsection{Anonymity and Unobservability} +Anonymity is the state of a subject not being identifiable from an attackers +perspective within the \emph{anonymity set} of possible subjects that performed +an action such as sending or receiving a message~\cite{anonterm}. For an +anonymity network, an attacker may not be able to determine who sent a message +into the network---providing a sender anonymity set of all possible +senders---and conversely, not be able to determine the recipient of a message +from the network out of all possible recipients in the recipient anonymity set. +Inherent for anonymity is that the subjects in an anonymity set change based on +what the attacker observes, e.g., when some subjects send or receive +messages~\cite{KedoganAP02,Raymond00}. In gist, anonymity is concerned with +hiding the \emph{relationship} between a sender and recipient, not its +existence. + +Unobservability is a strictly stronger notion than +anonymity~\cite{KedoganAP02,anonterm,Raymond00}. In addition to anonymity of the +relationship between a sender and recipient, unobservability also requires that +an attacker (not acting as either the sender or recipient) cannot sufficiently +distinguish if there is a sender or recipient or not~\cite{anonterm}. Perfect +unobservability is therefore the state of an attacker being unable to determine +if a sender/recipient should be part of the anonymity set or not. + +\subsection{Tor} +Tor is a low-latency anonymity network for anonymising TCP streams with about +eight million daily users, primarily used for anonymous browsing, censorship +circumvention, and providing anonymous (onion) services~\cite{tor,torusage}. +Because Tor is designed to be usable for low-latency tasks such as web browsing, +its threat model and design does not consider powerful attackers, e.g., global +passive adversaries that can observe all network traffic on the +Internet~\cite{trilemma,tor}. However, less powerful attackers such as ISPs and +ASes that observe a fraction of network traffic on the Internet are in scope. + +Users typically use Tor Browser---a customised version of Mozilla Firefox +(bundled with a local relay)---as a client that sends traffic through three +\emph{relays} when browsing a website on the regular Internet: a guard, middle, +and exit relay. Traffic from the client to the exit is encrypted in multiple +layers as part of fixed-size cells such that only the guard relay knows the +IP-address of the client and only the exit relay knows the destination website. +There are about 7000 public relays at the time of writing, all available in the +\emph{consensus} generated periodically by the network. The consensus is public +and therefore anyone can trivially determine if traffic is coming from the Tor +network by checking if the IP-address is in the consensus. Note that the +encrypted network traffic in Tor is exposed to network adversaries as well as +relays as it traverses the Internet. Figure~\ref{cat:fig:setting} depicts the +setting just described, highlighting the anonymity sets of users of Tor Browser +and the possible destination websites. + +\begin{figure}[!t] + \centering + \includegraphics[width=.85\columnwidth]{src/cat/img/setting} + \caption{Using Tor to browse to a website, where an attacker observes the encrypted traffic into the Tor network for a target user, attempting to determine the website the user is visiting.} + \label{cat:fig:setting} +\end{figure} + +\subsection{Website Fingerprinting} +\label{cat:sec:back:wf} +As mentioned in the introduction, attacks that analyse the encrypted network +traffic (a trace) between a Tor client and a guard relay with the goal to detect +the website a client is visiting are referred to as \emph{website +fingerprinting} (WF) attacks. Figure~\ref{cat:fig:setting} shows the typical +location of the attacker, who can also be the guard itself. WF attacks are +evaluated in either the \emph{closed} or the \emph{open} world. In the closed +world, an attacker \emph{monitors} a number of websites and it is the goal of +the attacker to determine which website out of all the possible monitored +websites a target is visiting. The open world is like the closed world with one +significant change: the target user may also visit \emph{unmonitored} websites. +This means that in the open world the attacker may also classify a trace as +unmonitored in addition to monitored, posing a significantly greater challenge +for the attacker in a more realistic setting than the closed world. The ratio +between monitored and unmonitored traces in a dataset is further a significant +challenge for WF attacks when assessing their real-world significance for +Tor~\cite{DBLP:conf/ccs/JuarezAADG14}. Typically, WF attacks are evaluated on +the frontpages of web\emph{sites}: web\emph{page} fingerprinting is presumably +much more challenging due to the orders of magnitude of more webpages than +websites. Unless otherwise stated, we only consider the frontpages of websites +in this paper. + +\subsubsection{Website Fingerprinting Attacks} +Prior to WF attacks being considered for use on Tor, they were used against +HTTPS~\cite{cheng1998traffic}, web +proxies~\cite{Hintz02,DBLP:conf/sp/SunSWRPQ02}, SSH +tunnels~\cite{DBLP:conf/ccs/LiberatoreL06}, and VPNs~\cite{HerrmannWF09}. For Tor, WF attacks are typically based on machine learning and can be +categorized based on if they use deep learning or not. + +Traditional WF attacks in the literature use manually engineered features +extracted from both the size and timing of packets (and/or cells) sent by Tor. +State of the art attacks with manually engineered features are +Wang-kNN~\cite{Wang}, CUMUL~\cite{cumul}, and k-FP~\cite{kfp}. For reference, +Wang-kNN has 1225 features, CUMUL 104 features, and k-FP 125 features. In terms +of accuracy, k-FP appears to have a slight edge over the other two, but all +three report over 90\% accuracy against significantly sized datasets. As +traditional WF attacks progressed, the features more than the type of machine +learning method have shown to be vital for the success of attacks, with an +emerging consensus on what are important features (e.g., coarse features like +number of incoming and outgoing packets)~\cite{Cherubin17,kfp,cumul}. + +Deep learning was first used for WF attacks by Abe and Goto in +2016~\cite{abe2016fingerprinting}. Relatively quickly, Rimmer \emph{et~al.} +reached parity with traditional WF attacks, lending credence to the emerging +consensus that the research community had found the most important features for +WF~\cite{DBLP:conf/ndss/RimmerPJGJ18}. However, recently Sirinam et +al.~\cite{DF} with Deep Fingerprinting (DF) significantly improved on other WF +attacks, also on the WTF-PAD and Walkie-Talkie defenses, and is at the time of +writing considered state-of-the-art. DF is based on a Convolutional Neural +Network (CNN) with a customized architecture for WF. Each packet trace as input +to DF is simply a constant size (5000) list of cells (or packets) and their +direction (positive for outgoing, negative for incoming), ignoring size and +timings. Based on the input, the CNN learns features on its own: we do not know +what they are, other than preliminary work indicating that the CNN gives more +weight to input early in the trace~\cite{mathews2018understanding}. + +The last layer of the CNN-based architecture of DF is a \emph{softmax} function: +it assigns (relative) probabilities to each class as the output of +classification. These probabilities allow a threshold to be defined for the +final classification in the open world, requiring that the probability of the +most likely class is above the threshold to classify as a monitored website. + +\subsubsection{Website Fingerprinting Defenses} +WF defenses for Tor modify the timing and number of (fixed-size) cells sent over +Tor when a website is visited. The modifications are done by injecting dummy +traffic and introducing artificial delays. Defenses can typically be classified +as either based on constant-rate traffic or not, where constant rate defenses +force all traffic to fit a pre-determined structure, forming \emph{collision +sets} for websites where their traffic traces appear identical to an attacker. +Non-constant rate defenses simply more-or-less randomly inject dummy traffic +and/or artificial delays with the hope of obfuscating the resulting network +traces. WF defenses are typically compared in terms of their induced +\emph{bandwidth} (BOH) and \emph{time} (TOH) overheads compared to no defense. +Further, different WF defenses make more or less realistic and/or practical +assumptions, making comparing overheads necessary but not nearly sufficient for +reaching conclusions. + +We briefly describe WF defenses that we later use to evaluate the impact of +attackers performing enhanced WF attacks with access to WOs: +\begin{description} + \item[Walkie-Talkie] by Wang and Goldberg~\cite{WT} puts Tor Browser into + half duplex mode and pads traffic such that different websites result in the + same cell sequences. This creates a collision set between a visited website + and a target \emph{decoy website} which results in the same cell sequence + with the defense. Their evaluation shows 31\% BOH and 34\% TOH. Collision + sets grow beyond size two at the cost of BOH. + \item[WTF-PAD] by Juarez \emph{et~al.}~\cite{wtf-pad} is based on the idea + of \emph{adaptive padding} \cite{DBLP:conf/esorics/ShmatikovW06} where fake + padding is injected only when there is no real traffic to send. The defense + is simulated on collected packet traces and its design is the foundation of + the circuit padding framework recently implemented in Tor. The simulations + report 50-60\% BOH and 0\% TOH. + \item[CS-BuFLO] by Cai \emph{et~al.}~\cite{csbuflo} is a \emph{constant rate} + defense where traffic is always sent at a constant rate between a sender and + receiver, improving on prior work by Dyer~et + al.~\cite{DBLP:conf/sp/DyerCRS12}. Their evaluation shows 220-270\% BOH and + 270-340\% TOH. + \item[Tamaraw] by Cai \emph{et~al.}~\cite{Tamaraw} is another constant rate defense + that further improves on CS-BuFLO. In the evaluation by Wang and Goldberg, + they report 103\% BOH and 140\% TOH for Tamaraw~\cite{WT}. + \item[DynaFlow] by Lu \emph{et~al.}~\cite{DynaFlow} is a \emph{dynamic} + constant-rate defense that allows for the defense to adjust its parameters + (notably the ``inter-packet interval'') based on configuration and on the + observed traffic. The evaluation shows an overall improvement over Tamaraw + when configured to use similar overheads. +\end{description} +The primary downside of defenses like Walkie-Talkie that depend on creating +collision sets for websites is that they require up-to-date knowledge of the +target website(s) to create collisions with (to know how to morph the traffic +traces): this is a significant practical issue for +deployment~\cite{DBLP:conf/wpes/NithyanandCJ14,Wang,WT}. Constant rate defenses +like CS-BuFLO and Tamaraw are easier to deploy but suffer from significant +overheads~\cite{csbuflo,Tamaraw}. WTF-PAD is hard to implement both efficiently +and effectively in practice due to only being simulated on packet traces as-is +and also being vulnerable to attacks like Deep Fingerprinting~\cite{wtf-pad,DF}. +While DynaFlow shows great promise, but requires changes at the client (Tor +Browser, local relay, or both) and at exit relays to \emph{combine} packets with +payloads smaller than Tor's cell size~\cite{DynaFlow}. Without combined packets +its advantage in terms of overhead compared to Tamaraw likely shrinks. + +\subsection{Challenges for WF Attacks in Practice} +A number of practical challenges for an attacker performing WF attacks have been +highlighted over the years, notably comprehensively so by Mike Perry of the Tor +Project~\cite{perryCrit} and Juarez et +al.~\cite{DBLP:conf/ccs/JuarezAADG14}. Wang and Goldberg have showed that +several of the highlighted challenges---such as maintaining a fresh data set and +determining when websites are visited---are practical to overcome~\cite{DBLP:journals/popets/WangG16}. What remains are two notably +significant challenges: distinguishing between different goals of the attacker +and addressing false positives. + + +For attacker goals when performing WF attacks, an attacker may want to detect +website visits with the goal of censoring access to it, to identify all users +that visit particular websites, or to identify every single website visited by +a target~\cite{perryCrit}. Clearly, these goals put different +constraints on the attacker. For censorship, classification must happen before +content is actually allowed to be completely transferred to the victim. For +monitoring only a select number of websites the attacker has the most freedom, +while detecting all website visits by a victim requires the attacker to have +knowledge of all possible websites on the web. + +For addressing false positives there are a number of aspects to take into +account. First, the web has millions of websites that could be visited by a +victim (not the case for onion services~\cite{JansenJGED18}), and each website +has a significant number of webpages that are often dynamically generated and +frequently changed~\cite{DBLP:conf/ccs/JuarezAADG14,perryCrit}. Secondly, how +often victims potentially visit websites that are monitored by an attacker is +unknown to the attacker, i.e., the \emph{base rate} of victims are unknown. The +base rate leads to even a small false positive rate of a WF attack overwhelming +an attacker with orders of magnitude more false positives than true positives, +leaving WF attacks impractical for most attacker goals in practice. + diff --git a/summary/src/cat/src/bayes.tex b/summary/src/cat/src/bayes.tex new file mode 100644 index 0000000..344d712 --- /dev/null +++ b/summary/src/cat/src/bayes.tex @@ -0,0 +1,96 @@ +\section{Bayes' Law for Estimating Utility of Website Oracles} \label{cat:app:bayes} + +To reason about the advantage to an attacker of having access to a WO, we +estimate the conditional probability of a target user visiting a monitored +website. For conditional probability we know that: +\begin{equation} + P(C_0 \cap C_1) = P(C_0|C_1)P(C_1) +\end{equation} + +For a hypothesis $H$ given conditional evidence $E$, Bayes' theorem states +that: +\begin{equation} + P(H|E) = \frac{P(E|H) P(H)}{P(E)} +\end{equation} + +Assume that E = $E_0 \cap E_1$, then: +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(E_0 \cap E_1|H) P(H)}{P(E_0 \cap E_1)} +\end{equation} + +Substituting $P(E_0 \cap E_1)$ with (1) we get: +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(E_0 \cap E_1|H) P(H)}{P(E_0|E_1)P(E_1)} +\end{equation} + +For a timeframe $t$, we define +\begin{description} + \item[$H$] the probability that target user(s) visited website $w$ over Tor in $t$ + \item[$E_0$] the probability that target user(s) visited a website over Tor in $t$ + \item[$E_1$] the probability that someone visited website $w$ over Tor in $t$ +\end{description} + +We see that $P(E_0 \cap E_1|H) = 1$ by definition and get: +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(H)}{P(E_0|E_1)P(E_1)} +\end{equation} + +Consider $P(E_0|E_1)$: while the conditional $E_1$ may have some minor affect on +user behaviour (in particular for overall popular websites), we assume that the +popularity of using Tor to visit a particular website (by any of the users of +Tor) has negligible impact on $E_0$ and treat $E_0$ and $E_1$ as independent: + +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(H)}{P(E_0)P(E_1)} +\end{equation} + +We can further refine $P(H)$ as being composed of at least: + +\begin{equation} + P(H) = P(E_0) \cap P(B_w) = P(E_0|B_w)P(B_w) +\end{equation} + +Where $P(B_w)$ is the base rate (prior) of the user(s) visiting website $w$ out +of all possible websites they visit ($P(E_0)$). We again assume (perhaps +naively) that $E_0$ is also independent of $B_w$, which gives us: + +\begin{equation} + P(H|E_0 \cap E_1) = \frac{P(E_0)P(B_w)}{P(E_0)P(E_1)} = \frac{P(B_w)}{P(E_1)} +\end{equation} + +In other words, if an attacker learns that target user(s) visited a website +($E_0$) over Tor and that website $w$ was also visited over Tor by some user +($E_1$), then we can estimate the probability that it was target user(s) that +visited website $w$ ($H$) as the ratio between the base rate (prior) for +visiting $w$ of target user(s) ($B_w$) and the probability that someone visited +the website over Tor ($E_1$), all within a timeframe $t$. + +Figure~\ref{cat:fig:bt} shows the results for simulating the probability $P(H|E_0 +\cap E_1)$ for different website popularities of $w$, base rates, and +timeframes. We see that with a realistic timeframe of $100$ ms, for all +base-rates but $10^{-6}$ there is non-zero conditional probability (and +therefore utility of WO access) for Alexa top 100k or less popular websites, +which covers about half of all website visits over Tor (excluding +\url{torproject.org}). + +\begin{figure}[!t] + \centering + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/bt10} + \caption{10 ms} + \label{cat:fig:bt:10ms} + \end{subfigure} + \begin{subfigure}[b]{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/bt100} + \caption{100 ms} + \label{cat:fig:bt:100ms} + \end{subfigure} + \begin{subfigure}[b]{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/bt1000} + \caption{1000 ms} + \label{cat:fig:bt:1000ms} + \end{subfigure} + \caption{The conditional probability as a function of user base rate and + website popularity (Alexa) for three different timeframes.} + \label{cat:fig:bt} +\end{figure} diff --git a/summary/src/cat/src/conclusions.tex b/summary/src/cat/src/conclusions.tex new file mode 100644 index 0000000..8554d23 --- /dev/null +++ b/summary/src/cat/src/conclusions.tex @@ -0,0 +1,25 @@ +\section{Conclusions} \label{cat:sec:conc} +WF+WO attacks use the base rate of all users of the network against victims, +significantly reducing false positives in the case of all but the most popular +websites visited over Tor. This is troubling in many ways, in part because +presumably many sensitive website visits are to unpopular websites used only by +local communities in regions of the world where the potential consequences of +identification are the worst. + +The threat model of Tor explicitly states that Tor does not consider attackers +that can observe both incoming and outgoing traffic~\cite{tor}. Clearly, a WO +gives the capability to infer what the outgoing traffic of the network encodes +on the application layer (the website visits). This is in a sense a violation of +Tor's threat model when combined with a WF attacker that also observes incoming +traffic. However, we argue that because of the plethora of possible ways for an +attacker to infer membership in the anonymity sets of Tor, WOs should be +considered within scope simply because Tor asserts that it is an anonymity +network. + +While the real-world impact of WF attacks on Tor users remains an open question, +our simulations show that false positives can be signficantly reduced by many +attackers with little extra effort for some WO sources. Depending on WO source, +this comes at a trade-off of less recall. For many attackers and attacker goals, +however, this is a worthwhile trade. To us, the threat of WF attacks appears +more real than ever, especially when also considering recent advances by deep +learning based attacks like DF~\cite{DF} and DeepCorr~\cite{deepcorr}. diff --git a/summary/src/cat/src/discussion.tex b/summary/src/cat/src/discussion.tex new file mode 100644 index 0000000..0de8584 --- /dev/null +++ b/summary/src/cat/src/discussion.tex @@ -0,0 +1,153 @@ +\section{Discussion} \label{cat:sec:disc} +For defenses that are based on the idea of creating collision sets between +packet traces generated by websites, oracle access is equivalent to being able +to perform set intersection between the set of websites in a collision set and +monitored websites visited at the time of fingerprinting. As the results show +from Section~\ref{cat:sec:wf}, some defenses can significantly reduce the recall of +WF attacks with WOs, but not the precision for the majority of websites and +website visits in Tor. + +Next, in Section~\ref{cat:sec:disc:unmon} we further cement that our simulations +show that WOs significantly reduces false positives, highlighting that a WF+WO +attacker surprisingly infrequently have to query a WO when classifying +unmonitored traces. Section~\ref{cat:sec:disc:flawed} discusses the impact of +imperfect WO sources with limited observability and false positives on the joint +WF+WO attack. Finally, Section~\ref{cat:sec:disc:limits} covers limitations of our +work, and Section~\ref{cat:sec:disc:miti} discusses possible mitigations. + +\subsection{A Large Unmonitored Dataset} +\label{cat:sec:disc:unmon} +We look at the number of false positives for a large test dataset consisting of +only unmonitored websites (representing a target user(s) with base rate 0, i.e., +that never visits any monitored website). Using the dataset of Greschbach et +al.~\cite{DBLP:conf/ndss/GreschbachPRWF17}, we trained DF on 100x100 monitored +and 10k unmonitored websites (8:2 stratified split for validation), resulting in +about 80\% validation accuracy after 30 epochs (so DF is clearly successful also +on this dataset). We then tested the trained classifier on \emph{only} 100k +\emph{unmonitored} traces, with and without oracle access (100ms resolution) for +different assumptions of the popularity of the \emph{monitored} websites. With +ten repetitions of the above experiment, we observed a false positive rate in +the order of $10^{-6}$ for monitored websites with Alexa popularity 10k. +Excluding \url{torproject.org}, this indicates that an attacker would have close +to no false positives for about half of all website visits in Tor, according to +the distribution of Mani \emph{et~al.}~\cite{torusage} (see +Section~\ref{cat:sec:sim:dist}). Without access to a WO, DF had a false positive +rate in the order of $10^{-3}$ to $10^{-4}$, depending on the threshold used by +the attacker. + +Recall how WOs are used as part of WF+WO attacks in +Section~\ref{cat:sec:oracles:generic}: the WO is only used if the WF attack +classifies a trace as monitored.\footnote{For WF attacks like DF that produces a +list of probabilities (Definition~\ref{cat:def:oracleprob}), just assume that the +attacker picks the threshold and checks if the probability is above as part of +the if-statement before using the WO.} This means that, in the example above, +the WO is used on average every $10^3$ to $10^4$ trace, to (potentially) rule +out a false positive. Clearly, this means that WO sources that can only be used +infrequently, e.g., due to caching as in DNS, are still valuable for an +attacker. + +\subsection{Imperfect Website Oracle Sources} +\label{cat:sec:disc:flawed} +Our analysis considered an ideal source of a WO that observes all visits to +targeted \emph{monitored} websites of the attacker and that produces no false +positives. Next, using the same dataset and setup as for +Figure~\ref{cat:fig:df:nodef} with an Alexa starting rank of $10^4$, we simulate the +impact on recall and the False-Negative-to-Positive-rate\footnote{For a +classifier with multiple monitored classes and an unmonitored class (as for our +modified DF, see Section~\ref{cat:sec:wf:aug}), FNP captures the case when the +classifier classifies an unmonitored testing trace as any monitored class. In +addition to FNP, such a classifier can also confuse one monitored website for +another. Both these cases are false positives~\cite{Wang2015a}.} (FNP) of the +\emph{joint} WF+WO attack for five false positive rates \emph{of the WO} and a +fraction of observed website visits. + +Figure~\ref{cat:fig:factor-recall} shows the impact on the joint recall in the above +setting. We see that recall is directly proportional to the fraction of observed +visits, as per the results of Greschbach +\emph{et~al.}~\cite{DBLP:conf/ndss/GreschbachPRWF17}. Further, false positives +for the WO have a positive impact on the fraction of recall, counteracting +visits missed due to limited observability. For the same reason, a larger +timeframe or monitoring more popular websites would also improve recall. + +\begin{figure}[!t] + \centering + \includegraphics[width=.67\columnwidth]{src/cat/img/factor-recall} + \caption{How limited WO observability effects the final recall of a WF+WO attack for five different WO false positive rates.} + \label{cat:fig:factor-recall} +\end{figure} + +Figure~\ref{cat:fig:factor-fnp} shows the impact on the joint FNP. Note that lower +FNP is better for an attacker. We see that limited observability has no impact +on FNP. This makes sense, because a WO cannot confirm anything it does not +observe. The FNP fraction for the joint FNP is roughly proportional to the FP of +the WO. We also see that the FNP fraction consistently is above the FP of the +WO: this is because---beyond the simulated FP---there is a slight probability +that someone else (in our simulation of the Tor network) visited the website for +each classified trace. A larger timeframe or monitoring more popular websites +would also increase FNP. + +\begin{figure}[!t] + \centering + \includegraphics[width=.67\columnwidth]{src/cat/img/factor-fnp} + \caption{How limited WO observability effects the final False-Negative-to-Positive-rate (FNP) of a WF+WO attack for five different WO false positive rates. Lower is better.} + \label{cat:fig:factor-fnp} +\end{figure} + +From above results, our simulations indicate that even with a deeply imperfect +WO source an attacker can get significant advantage in terms of reduced false +positives at a comparatively small cost of recall. For example, given a WO with +50\% observability and false positives, the resulting WF+WO attack has about +75\% of the recall of the WF attack and slightly more than half the false +positives. + +\subsection{Limitations} +\label{cat:sec:disc:limits} +As discussed in Section~\ref{cat:sec:back:wf}, there are a number of practical +limitations in general for WF attacks. Regarding attacker goals, WOs are likely +less useful for the purpose of censorship than for other goals. Many sources of +WOs cannot be accessed in real-time, giving little utility for an attacker that +needs to make a near real-time censorship decision. An attacker that only wants +to detect visits to a few selected monitored websites gains significant utility +from WOs, as long as the detection does not have to be in real-time. It is also +noteworthy that an attacker that wants to detect all possible website visits by +a victim can use the WO to in essence ``close the world'' from all possible +websites to only those visited over Tor while the victim is actively browsing. +Granted, this requires a source for the WO that is slightly different from our +definition, but some do offer this: e.g., an attacker that gains comprehensive +control over the DNS resolvers used by Tor +exits~\cite{DBLP:conf/ndss/GreschbachPRWF17}. + +When it comes to false positives a significant limitation of our simulations is +that we consider fingerprinting the frontpages of websites and not specific +webpages. Several sources or WOs are not able to detect webpage visits. This is +also true for subsequent webpage visits on the same website after first visiting +the frontpage of a website (e.g., DNS and OCSP will be cached). An attacker with +the goal of detecting each such page visit will thus suffer more false positives +or fail at detecting them for some sources of WOs. + +\subsection{Mitigations} +\label{cat:sec:disc:miti} +The best defense against WOs is WF defenses that significantly reduce the recall +of WF attacks. In particular, if an attacker can significantly reduce the +website anonymity set \emph{after} accounting for information from the WO, then +attacks are likelier to succeed. This implies that most websites need to (at +least have the potential to) result in the same network traces, as we see with +DynaFlow, Tamaraw, and CS-BuFLO. + +For onion websites we note that the DHT source of a WO from +Section~\ref{cat:sec:sources} is inherent to the design of onion services in Tor. +Defenses that try to make it harder to distinguish between regular website +visits and visits to onion websites should also consider this WO source as part +of their analysis, in particular for v2 onion services. + +Finally, some sources of WOs could be minimized. If you run a potentially +sensitive website: do not use RTB ads, staple OCSP, have as few DNS entries as +possible\footnote{As noted by +Greschbach~\emph{et~al.}~\cite{DBLP:conf/ndss/GreschbachPRWF17}, websites may +have several unique domain names. Each of those could be used independently to +query several sources (e.g., DNS) of WOs.} with a high TTL, do not use CDNs, do +not retain any access logs, and consider if your website, web server, or +operating system have any information leaks that can be used as an oracle. If +you run a Tor exit, consider not using Google or Cloudflare for your DNS but +instead use your ISP's resolver if +possible~\cite{DBLP:conf/ndss/GreschbachPRWF17}. diff --git a/summary/src/cat/src/intro.tex b/summary/src/cat/src/intro.tex new file mode 100644 index 0000000..b65baa9 --- /dev/null +++ b/summary/src/cat/src/intro.tex @@ -0,0 +1,122 @@ +\section{Introduction} \label{cat:sec:intro} +A Website Fingerprinting (WF) attack is a type of traffic analysis attack where +an attacker attempts to learn which websites are visited through encrypted +network tunnels---such as the low-latency anonymity network Tor~\cite{tor} or +Virtual Private Networks (VPNs)---by analysing the encrypted network +traffic~\cite{cheng1998traffic,HerrmannWF09,Hintz02,DBLP:conf/ccs/LiberatoreL06,PanchenkoNZE11,DBLP:conf/sp/SunSWRPQ02}. +The analysis considers only the size and timing of encrypted packets sent over +the network to and from a target client. This makes it possible for attackers +that only have the limited \emph{capability} of observing the encrypted network +traffic (sometimes referred to as a \emph{local eavesdropper}) to perform WF +attacks. Sources of such capabilities include ISPs, routers, network interface +cards, WiFi hotspots, and guard relays in the Tor network, among others. Access +to encrypted network traffic is typically not well-protected over the Internet +because it is already in a form that is considered safe to expose to attackers +due to the use of encryption. + +The last decade has seen significant work on improved WF attacks (e.g., +\cite{CaiZJJ12,kfp,DF,Wang}) and defenses (e.g, +\cite{csbuflo,Tamaraw,wtf-pad,DynaFlow}) accompanied by an ongoing debate on the +real-world impact of these attacks justifying the deployment of defenses or not, +in particular surrounding Tor (e.g., +\cite{DBLP:conf/ccs/JuarezAADG14,perryCrit,DBLP:journals/popets/WangG16}). +There are significant real-world challenges for an attacker to successfully +perform WF attacks, such as the sheer size of the web (about 200 million active +websites~\cite{netcraft-survey}), +detecting the beginning of website loads in encrypted network traces, background +traffic, maintaining a realistic and fresh training data set, and dealing with +false positives. + +Compared to most VPN implementations, Tor has some basic but rather ineffective +defenses in place against WF attacks, such as padding packets to a constant size +and randomized HTTP request pipelining~\cite{CaiZJJ12,tor,Wang}. Furthermore, +Tor recently started implementing a framework for circuit padding machines to +make it easier to implement traffic analysis defenses~\cite{tor-0401} +based on adaptive padding~\cite{wtf-pad,DBLP:conf/esorics/ShmatikovW06}. +However, the unclear real-world impact of WF attacks makes deployment of +proposed effective (and often prohibitively costly in terms of bandwidth and/or +latency overheads) WF defenses a complicated topic for researchers to reach +consensus on and the Tor Project to decide upon. + +\subsection{Introducing Website Oracles} +In this paper, we introduce the security notion of a \emph{Website Oracle} (WO) +that can be used by attackers to augment any WF attack. A WO answers ``yes'' or +``no'' to the question ``was a particular website visited over Tor at this point +in time?''. We show through simulation that such a \emph{capability}---access to +a WO---greatly reduces the false positive rate for an attacker attempting to +fingerprint the majority of websites and website visits through the Tor network. +The reduction is to such a great extent that our simulations suggest that false +positives are no longer a significant reason for why WF attacks lack real-world +impact. This is in particular the case for onion services where the estimated +number of websites is a fraction compared to the ``regular'' +web~\cite{JansenJGED18}. + +Our simulations are based on the privacy-preserving network measurement results +of the live Tor network in early 2018 by Mani \emph{et~al.}~\cite{torusage}. +Besides simulating WOs we also identify a significant number of potential +sources of WOs that are available to a wide range of attackers, such as nation +state actors, advertisement networks (including their customers), and operators +of Tor relays. Some particularly practical sources---due to DNS +and how onion services are accessed---can be used by anyone with modest +computing resources. + +We argue that sources of WOs are inherent in Tor due to its design goal of +providing \emph{anonymous} and not \emph{unobservable} communication: observable +anonymity sets are inherent for anonymity~\cite{KedoganAP02,anonterm,Raymond00}, +and a WO can be viewed as simply being able to query for membership in the +destination/recipient anonymity set (the potential websites visited by a Tor +client). The solution to the effectiveness of WF+WO attacks is therefore not to +eliminate all sources---that would be impossible without unobservable +communication~\cite{KedoganAP02,anonterm,Raymond00}---but to assume that an +attacker has WO access when evaluating the effectiveness of WF attacks and +defenses, even for weak attackers like local (passive) eavesdroppers. + +The introduction of a WO in the setting of WF attacks is similar to how +encryption schemes are constructed to be secure in the presence of an attacker +with access to \emph{encryption} and \emph{decryption} oracles (chosen plaintext +and ciphertext attacks, respectively) \cite{GoldwasserM84,NaorY90,RackoffS91}. +This is motivated by the real-world prevalence of such oracles, and the high +impact on security when paired with other weaknesses of the encryption schemes: +e.g., Bleichenbacher~\cite{Bleichenbacher98} padding oracle attacks remain an +issue in modern cryptosystems today despite being discovered about twenty years +ago~\cite{Merget19,RonenGGSWY18}. + +\subsection{Contributions and Structure} +Further background on anonymity, Tor, and WF are presented in +Section~\ref{cat:sec:back}. Section~\ref{cat:sec:oracles} defines a WO and describes two +generic constructions for combining a WO with \emph{any} WF attack. Our generic +constructions are a type of Classify-Verify method by Stolerman +\emph{et~al.}~\cite{stolerman2013classify}, first used in the context of WF +attacks by Juarez \emph{et~al.}~\cite{DBLP:conf/ccs/JuarezAADG14} and later by +Greschbach \emph{et~al.} \cite{DBLP:conf/ndss/GreschbachPRWF17}. +Section~\ref{cat:sec:sources} presents a number of sources of WOs that can be used +by a wide range of attackers. We focus on practical sources based on DNS and +onion service directories in Tor, offering \emph{probabilistic} WOs that anyone +can use with modest resources. We describe how we simulate access to a WO +throughout the rest of the paper in Section~\ref{cat:sec:sim}, based on Tor network +measurement data from Mani \emph{et~al.}~\cite{torusage}. + +Section~\ref{cat:sec:wf} experimentally evaluates the performance of augmenting the +state-of-the-art WF attack Deep Fingerprinting (DF) by Sirinam +\emph{et~al.}~\cite{DF} with WO access using one of our generic constructions. +We show significantly improved classification performance against unprotected +Tor as well as against traces defended with the WF defenses WTF-PAD by Juarez +\emph{et~al.}~\cite{wtf-pad} and Walkie-Talkie by Wang and Goldberg~\cite{WT}, +concluding that the defenses are ineffective in this new setting where an +attacker has access to a WO. Further, we also evaluate DF with WO access against +Wang \emph{et~al.}'s dataset~\cite{Wang} with simulated traces for the +constant-rate WF defenses CS-BuFLO and Tamaraw by Cai et +al.~\cite{csbuflo,Tamaraw}. Our results show that constant-rate defenses are +overall effective defenses but not efficient due to the significant induced +overheads. We then evaluate two configurations of the WF defense DynaFlow by Lu +\emph{et~al.}~\cite{DynaFlow}, observing similar effectiveness as CS-BuFLO but +at lower overheads approaching that of WTF-PAD and Walkie-Talkie. + +In Section~\ref{cat:sec:disc} we discuss our results, focusing on the impact on +false positives with WO access, how imperfect sources for WOs impact WF+WO +attacks, limitations of our work, and possible mitigations. Our simulations +indicate that WF defenses should be evaluated against WF attacks based on how +they minimise \emph{recall}. We present related work in +Section~\ref{cat:sec:related}, including how WF+WO attacks relate to traffic +correlation and confirmation attacks. Section~\ref{cat:sec:conc} briefly concludes +this paper. diff --git a/summary/src/cat/src/lessons.tex b/summary/src/cat/src/lessons.tex new file mode 100644 index 0000000..70f49f3 --- /dev/null +++ b/summary/src/cat/src/lessons.tex @@ -0,0 +1,47 @@ +\section{Lessons from Simulation} \label{cat:app:lessons} +With the ability to simulate access to WOs we can now simulate the entire +website anonymity set for Tor. To get a better understanding of why WOs are so +useful for an attacker performing WF attacks, we look at two results from the +simulation below. + +\subsection{Time Until Website Visited over Tor} +\label{cat:sec:sim:timeuntil} + +Figure~\ref{cat:fig:timeuntil} shows the time until there is a 50\% probability that +a website has been visited over Tor depending on website popularity (Alexa, as +discussed in Section~\ref{cat:sec:sim:dist}). Within ten seconds, we expect that +most of Alexa top 1k has been visited. Recall that this represents about one +third of all website visits over Tor. The less popular websites on Alexa top +one-million represent another third of all visits, quickly approaching hundreds +of seconds between visits. For the remaining third of all website visits we +expect them to be even less frequent. + +\begin{figure}[!t] + \centering + \includegraphics[width=.67\columnwidth]{src/cat/img/timeuntilvisited} + \caption{The simulated time until there is a 50\% probability that a website for different Alexa ranks has been visited over Tor.} + \label{cat:fig:timeuntil} +\end{figure} + +\subsection{Visits Until First False Positive} +\label{cat:sec:sim:fp} +Assume that target user(s) have a base rate of $0$, i.e., they never visit the +attacker's monitored websites. With WO access, we can determine how many +(naively assumed independent) website visits it \emph{at least} takes until +there is a 50\% chance that the attacker's classifier gets a false positive. +This is because if the attacker's website classifier without oracle access +always returns a false positive, then the false positive rate by the WF+WO +attack will be determined by when the WO says that the---incorrectly classified +as monitored---website has been visited. Figure~\ref{cat:fig:probfp} shows the +expected number of visits \emph{by the victim(s)} for different timeframes based +on the popularity of the monitored websites. Note that the attacker per +definition chooses which websites are monitored and can therefore take the +probability of false positives into account. + +\begin{figure}[t] + \centering + \includegraphics[width=.67\columnwidth]{src/cat/img/probfp} + \caption{The number of website visits until there is a 50\% probability + that a website oracle would contribute to a false positive.} + \label{cat:fig:probfp} +\end{figure} diff --git a/summary/src/cat/src/main.tex b/summary/src/cat/src/main.tex new file mode 100644 index 0000000..f12287b --- /dev/null +++ b/summary/src/cat/src/main.tex @@ -0,0 +1,121 @@ +\documentclass[USenglish,oneside,twocolumn]{article} + +\usepackage[utf8]{inputenc}%(only for the pdftex engine) +%\RequirePackage[no-math]{fontspec}%(only for the luatex or the xetex engine) +\usepackage[big]{dgruyter_NEW} +\usepackage{subcaption} +\usepackage[dvipsnames]{xcolor} +\usepackage{mathtools} +\usepackage{amsthm, amssymb} +\usepackage[]{cryptocode} +\usepackage{footmisc} +\hypersetup{ + colorlinks, + citecolor=RedViolet, + linkcolor=RedViolet, + urlcolor=MidnightBlue} + +\theoremstyle{definition} +\newtheorem{definition}{Definition} + +\DOI{foobar} + +\newcommand{\TODO}[1]{\textcolor{red}{TODO:} #1} + +\cclogo{\includegraphics{by-nc-nd.pdf}} + +\begin{document} + + \author*[1]{Tobias Pulls} + \author[2]{Rasmus Dahlberg} + \affil[1]{Karlstad University, E-mail: tobias.pulls@kau.se} + \affil[2]{Karlstad University, E-mail: rasmus.dahlberg@kau.se} + + \title{\huge Website Fingerprinting with Website Oracles} + + \runningtitle{Website Fingerprinting with Website Oracles} + + \begin{abstract} + {\input{src/abstract}} + \end{abstract} + \keywords{website fingerprinting, website oracles, traffic analysis, security model, design} +% \classification[PACS]{} +% \communicated{...} +% \dedication{...} + +\journalname{Proceedings on Privacy Enhancing Technologies} +\DOI{Editor to enter DOI} +\startpage{1} +\received{..} +\revised{..} +\accepted{..} + +\journalyear{..} +\journalvolume{..} +\journalissue{..} + +\maketitle +\eject +\section{Introduction} +\label{cat:sec:intro} +\input{src/intro} + +\section{Background} +\label{cat:sec:back} +\input{src/background} + +\section{Website Oracles} +\label{cat:sec:oracles} +\input{src/oracles} + +\section{Sources of Website Oracles} +\label{cat:sec:sources} +\input{src/sources} + +\section{Simulating Website Oracles} +\label{cat:sec:sim} +\input{src/sim} + +\section{Deep Fingerprinting with Website Oracles} +\label{cat:sec:wf} +\input{src/wf} + +\section{Discussion} +\label{cat:sec:disc} +\input{src/discussion} + +\section{Related Work} +\label{cat:sec:related} +\input{src/related} + +\section{Conclusions} +\label{cat:sec:conc} +\input{src/conclusions} + +\section*{Acknowledgements} +We would like to thank Jari Appelgren, Roger Dingledine, Nicholas Hopper, Marc +Juarez, George Kadianakis, Linus Nordberg, Mike Perry, Erik Wästlund, and the +PETS reviewers for their valuable feedback. Simulations were performed using the +\href{http://snic.se/}{Swedish National Infrastructure for Computing} (SNIC) at +\href{https://www.hpc2n.umu.se/}{High Performance Computing Center North} +(HPC2N). This research was funded by the +\href{https://internetstiftelsen.se/en/}{Swedish Internet Foundation} and the +\href{www.kks.se}{Knowledge Foundation of Sweden}. + +\bibliographystyle{abbrv} +\bibliography{ref-min} + +\appendix +\section{Bayes' Law for Estimating Utility of Website Oracles} +\label{cat:app:bayes} +\input{src/bayes} + +\section{Lessons from Simulation} +\label{cat:app:lessons} +\input{src/lessons} + +\section{Sources of Website Oracles} +\label{cat:app:sources} +\input{src/othersources} + +\end{document} diff --git a/summary/src/cat/src/oracles.tex b/summary/src/cat/src/oracles.tex new file mode 100644 index 0000000..01f1d99 --- /dev/null +++ b/summary/src/cat/src/oracles.tex @@ -0,0 +1,126 @@ +\section{Website Oracles} \label{cat:sec:oracles} +We first define a WO and then present two generic constructions for use with WF +attacks based on the kind of output the WF attack supports. + +\subsection{Defining Website Oracles} +\begin{definition} + \label{cat:def:oracle} + A website oracle answers true or false to the question ``was a particular + monitored website $w$ visited over the Tor network at time $t$?''. +\end{definition} + +A WO considers only web\emph{sites} and not web\emph{pages} for $w$, but note +that even for webpage fingerprinting being able to narrow down the possible +websites that webpages belong to through WO access is a significant advantage to +an attacker. The time $t$ refers to a \emph{period of time} or \emph{timeframe} +during which a visit should have taken place. Notably, different sources of WOs +may provide different \emph{resolutions} for time, forcing an attacker to +consider a timeframe in which a visit could have taken place. For example, +timestamps in Apache or nginx access logs use regular Unix timestamps as default +(i.e., seconds), while CDNs like Cloudflare maintain logs with Unix nanosecond +precision. Further, there are inherent limitations in approximating $t$ for the +query when the attacker in addition to WO access can only directly observe +traffic from the victim into Tor. We explore this later in +Section~\ref{cat:sec:sim:timeframe}. + +One important limitation we place on the use of a WO with WF is that the +attacker can only query the WO for \emph{monitored} websites. The open world +setting is intended to capture a more realistic setting for evaluating attacks, +and inherent in this is that the attacker cannot train (or even enumerate) all +possible websites on the web. Given the ability to enumerate and query all +possible websites gives the adversary a capability in line with a global passive +adversary performing correlation attacks, which is clearly outside of the threat +model of Tor~\cite{tor}. We further relate correlation and confirmation attacks +to WF+WO attacks in Section~\ref{cat:sec:related}. + +Definition~\ref{cat:def:oracle} defines the ideal WO: it never fails to observe a +\emph{monitored} website visit, it has no false positives, and it can answer for +an arbitrary $t$. This is similar to how encryption and decryption oracles +always encrypt and decrypt when modelling security for encryption +schemes~\cite{GoldwasserM84,NaorY90,RackoffS91}. In practice, sources of all of +these oracles may be more or less ideal and challenging for an attacker to use. +Nevertheless, the prevalence of sources of these imperfect oracles motivate the +assumption of an attacker with access to an ideal oracle. Similarly, for WOs, we +motivate this assumption in Sections~\ref{cat:sec:sources}~and~\ref{cat:sec:sim}, in +particular wrt.\ a timeframe on the order of (milli)seconds. +Section~\ref{cat:sec:disc} further considers non-ideal sources of WOs and the effect +on WF+WO attacks, both when the WO can produce false positives and when the +source only observes a fraction of visits to monitored websites. + +\subsection{Generic Website Fingerprinting Attacks with Website Oracles} +\label{cat:sec:oracles:generic} +As mentioned in Section~\ref{cat:sec:back:wf}, a WF attack is a classifier that is +given as input a packet trace and provides as output a classification. The +classification is either a monitored site or a class representing unmonitored +(in the open world). Figure~\ref{cat:fig:setting-oracle} shows the setting where an +attacker capable of performing WF attacks also has access to a WO. We define a +generic construction for WF+WO attacks that works with \emph{any} WF attack in +the open world in Definition~\ref{cat:def:oraclewf}: + +\begin{figure}[!t] + \centering + \includegraphics[width=\columnwidth]{src/cat/img/setting-oracle} + \caption{WF+WO attacks, where the WO infers membership of a particular website $w$ in the website anonymity set of all possible websites visited over Tor during a particular timeframe $t$.} + \label{cat:fig:setting-oracle} +\end{figure} + +\begin{definition}[Binary verifier] + \label{cat:def:oraclewf} + Given a website oracle $o$ and WF classification $c$ of a trace collected at + time $t$, if $c$ is a monitored class, query the oracle $o(c,t)$. Return $c$ + if the oracle returns true, otherwise return the unmonitored class. +\end{definition} + +Note that the WO is only queried when the WF \emph{classification} is for a +monitored website and that Definition~\ref{cat:def:oraclewf} is a generalisation of +the ``high precision'' DefecTor attack by Greschbach +\emph{et~al.}~\cite{DBLP:conf/ndss/GreschbachPRWF17}. In terms of +\emph{precision} and \emph{false positives}, the above generic WF+WO +construction is strictly superior to a WF attack without a WO. Assume that the +WF classification incorrectly classified an unmonitored trace as monitored, then +there is \emph{only a probability} that a WO also returns true, depending on the +probability that someone else visited the website in the same timeframe over +Tor. If it does not, then a false positive is prevented. That is, a WF attack +without WO access is identical to a WF attack with access to a useless WO that +always returns true; any improvements beyond that will only help the attacker in +ruling out false positives. We consider the impact on \emph{recall} later. + +We can further refine the use of WOs for the subset of WF attacks that support +providing as output an ordered list of predictions in decreasing likelihood, +optionally with probabilities, as shown in Definition~\ref{cat:def:oracleprob}: + +\begin{definition}[List verifier] + Given an ordered list of predictions in the open world and a website oracle: + + {\centering + \pseudocode[syntaxhighlight=auto]{% + \t\pcfor \text{top prediction $p$ in list} \pcdo \\ + \t\pcind \pcif p \text{ is unmonitored or oracle says $p$ visited } \t\pcthen\\ + \t\pcind[2] return \text{list}\\ + \t\pcind \text{move $p$ to last in list and optionally update probabilities}} + } + \label{cat:def:oracleprob} +\end{definition} + +First, we observe that if the WF attack thinks that it is most likely an +unmonitored website, then we accept that because a WO can only teach us +something new about monitored websites. Secondly, if the most likely prediction +has been visited according to the WO then we also accept that classification +result. Finally, all that is left to do is to consider this while repeatedly +iterating over the top predictions: if the top classification is a monitored +website that has not been visited according to the WO, then move it from the top +of the list and optionally update probabilities (if applicable, then also set +$p=0.0$ before updating) and try again. Per definition, we will either hit the +case of a monitored website that has been visited according to the WO or an +unmonitored prediction. As mentioned in Section~\ref{cat:sec:back:wf}, WF output +that has some sort of probability or threshold associated with classifications +are useful for attackers with different requirements wrt. false positives and +negatives. + +One could consider a third approach based on repeatedly querying a WO to first +determine if any monitored websites have been visited and then train an +optimised classifier (discarding monitored websites that we know have not been +visited). While this may give a minor improvement, our results later in this +paper as well as earlier work show that confusing monitored websites is a minor +issue compared to confusing an unmonitored website as +monitored~\cite{DBLP:conf/ndss/GreschbachPRWF17,DBLP:conf/ccs/JuarezAADG14,Wang}. diff --git a/summary/src/cat/src/othersources.tex b/summary/src/cat/src/othersources.tex new file mode 100644 index 0000000..35d3c71 --- /dev/null +++ b/summary/src/cat/src/othersources.tex @@ -0,0 +1,112 @@ +\section{Sources of Website Oracles} \label{cat:app:sources} +There are a wide number of possible sources to instantiate WOs. Here we present +some details on a selection of sources, far from exhaustive. + +\subsection{Surveillance Programmes} +Intelligence agencies operate surveillance programmes that perform bulk +collection and retention of communications metadata, including +web-browsing~\cite{lyon2014surveillance}. For example, the Snowden revelations +included \emph{Marina}: +\begin{quote} + Of the more distinguishing features, Marina has the ability to look back on + the last 365 days' worth of DNI (Digital Network Intelligence) metadata seen + by the Sigint collection system, \emph{regardless} whether or not it was + tasked for collection~\cite{guardian}. +\end{quote} + +Another example is the prevalence of nation states to monitor Internet traffic +that crosses geographic borders. For example, China operates the Great Firewall +of China that is also used for censorship purposes. Due to the nature of Tor and +how exits are selected, visits to websites that are not operated by world-wide +reaching hosting providers are highly likely to cross multiple nation borders as +traffic goes from an exit to the website. It is also worth to highlight that any +sensitive website hosted from within a country where a state actor is interested +in identifying visitors are likely to capture traffic to that website due to the +Tor traffic crossing its borders more often than not. + +\subsection{Content Delivery Networks} +Content Delivery Networks (CDNs), such as Akamai, Google, and Amazon host +different types of content for a significant fraction of all websites on the +Internet~\cite{ScheitleHGJZSV18}. Inherently, all requests for these resources +are easily identified as coming from Tor exits, and depending on content, things +like unique identifiers and HTTP referrer headers enable the CDN provider to +infer the website the content is hosted on. + +\subsection{Internet Giants} +Internet giants like Google, Apple, Facebook, Amazon, Microsoft, and Cloudflare +make up a large fraction the web as we know it. For example the use of Google +Analytics is wide-spread, so is hosting in clouds provided by several of these +giants, and Cloudflare with its ``cloud network platform'' hosts over 13 million +domains~\cite{cf-size}. +While some of them may do what is in their power to protect the valuable data +they process and retain, they are still subject to many legal frameworks across +the world that might not offer the best of protections for, say, access logs +pertaining to ``anonymous'' users of Tor when requested by authorities of nation +states. As another example, Cloudflare offers a nice API for their customers to +get their access logs with Unix nanosecond precision. The logs are retained for +up to seven +days~\cite{cf-retention}, +giving ample time for legal requests. + +\subsection{Access Logs of Web Servers} +The vast majority of web servers retain access logs by default. Typically, they +provide unix timestamps with seconds as the resolution (the case for Apache and +nginx). Further, the access logs may be shipped to centralised security +information and event management (SIEM) systems for analysis, with varying +retention times and rigour in storage. For example, it is common to +``anonymize'' logs by removing parts of the IP-addresses and then retaining them +indefinitely, as is the case for Google who removes part of IP addresses in logs +after nine +months~\cite{google-retention}. + + +\subsection{Middleboxes} +Network middleboxes that observe, analyse, and potentially retain network +traffic abound. Especially in more oppressive countries, middleboxes are often +used for censorship or dragnet surveillance, e.g., as seen with Blue Coat in +Syria~\cite{bluecoat}. + +\subsection{OCSP Responders} +Chung \emph{et~al.}~\cite{ocsp-chung} found in a recent study that 95.4\% of all +certificates support the Online Certificate Status Protocol (OCSP), which allows +a client to query the responsible CA in real-time for a certificate's revocation +status via HTTP. As such, the browsed website will be exposed to the CA in +question. From a privacy-standpoint this could be solved if the server +\emph{stapled} a recently fetched OCSP response with the served certificate. +Unfortunately, only 35\% of Alexa's top-one-million uses OCSP +stapling~\cite{ocsp-chung}. + +Unless an OCSP response is stapled while visiting a website in a default +configuration of the Tor browser, the status of a certificate is checked in +real-time using OCSP. As such, any CA that issued a certificate for a website +without OCSP stapling could instantiate a WO with an RTT-based resolution. +Similarly, any actor that observes most OCSP traffic (which is in plaintext due +to HTTP) gets the same capability. To better understand who could instantiate a +WO based on OCSP we performed preliminary traceroute measurements\footnote{% + Every RIPE Atlas probe used its configured DNS resolver(s). In total we + requested 2048 WW-probes for one-off measurements. +} on the RIPE Atlas network towards four OCSP +responders that are hosted by particularly large CAs: Let's Encrypt, Sectigo, +DigiCert, and GoDaddy. Let's Encrypt and Sectigo are fronted by a variety of +actors (mainly due to CDN caching), while DigiCert is fronted by a single CDN. +Requests towards GoDaddy's OCSP responder always end-up in an AS hosted by +GoDaddy. + +\subsection{Tor Exit Relays} +Anyone can run a Tor exit relay and have it be used by all Tor users. Obviously, +the operator of the exit relay can observe when its relay is used and the +destination websites. At the time of writing, the consumed exit bandwidth of the +entire Tor network is around 50~Gbit/s. This makes the necessary investment for +an attacker that wishes to get a decent chunk of exit bandwidth more a question +of stealthily deploying new exit relays than prohibitively large monetary costs. + +\subsection{Information Leaks} +More sophisticated attackers can look for information leaks at the application, +network, and operating system levels that allow them to infer that websites have +been visited. Application level information leaks are particularly of concern +for onion services: any observable state that can be tied to a new visitor is a +WO for an onion visit (this is not the case for ``regular'' websites). Such +state can include online status or the number of online users of a service, any +observable activity with timestamps, a predictable caching structure, and so on. +Similar information leaks can also occur on the network and operating system +level~\cite{DBLP:journals/ton/CaoQWDKM18,DBLP:conf/uss/EnsafiPKC10,DBLP:conf/ccs/QianMX12}. diff --git a/summary/src/cat/src/ref-min.bib b/summary/src/cat/src/ref-min.bib new file mode 100644 index 0000000..fe12c0e --- /dev/null +++ b/summary/src/cat/src/ref-min.bib @@ -0,0 +1,837 @@ +@misc{google-retention, + author = {Google LLC.}, + title = {How {Google} retains data we collect}, + howpublished = {\url{https://web.archive.org/web/20190227170903/https://policies.google.com/technologies/retention}}, +} + +@misc{cf-retention, + author = {Cloudflare Inc.}, + title = {{FAQs}}, + howpublished = {\url{https://web.archive.org/web/20190227165850/https://developers.cloudflare.com/logs/faq/}}, +} + +@misc{cf-size, + author = {Cloudflare Inc.}, + title = {Helping Build a Better Internet}, + howpublished = {\url{https://web.archive.org/web/20190227165133/https://www.cloudflare.com/}}, +} + +@misc{guardian, + author = {The Guardian}, + title = {{NSA} stores metadata of millions of web users for up to a year, secret files show}, + howpublished = {\url{https://www.theguardian.com/world/2013/sep/30/nsa-americans-metadata-year-documents}, accessed 2019-02-27}, +} + +@misc{wiki, + author = {Wikipedia contributors}, + title = {Softmax function---{Wikipedia}{,} the free encyclopedia.}, + howpublished = {\url{https://en.wikipedia.org/w/index.php?title=Softmax_function&oldid=883834589}, accessed 2019-02-17}, +} + +@misc{alexa, + author = {Amazon}, + title = {The top 500 sites on the web}, + howpublished = {\url{https://www.alexa.com/topsites}, accessed 2019-02-13}} +} + +@misc{tor-safety-board, + author = {Tor Project}, + title = {Tor Research Safety Board}, + howpublished = {\url{https://research.torproject.org/safetyboard.html}, accessed 2019-02-13}, +} + +@misc{google-bid-anon, + author = {Google LLC.}, + title = {Set your desktop and mobile web inventory to Anonymous, Branded, or Semi-transparent in {AdX}}, + howpublished = {\url{https://web.archive.org/web/20190228123602/https://support.google.com/admanager/answer/2913411?hl=en&ref_topic=2912022}}, +} + +@misc{google-bid, + author = {Google LLC.}, + title = {Real-Time Bidding Protocol Buffer v.161 }, + howpublished = {\url{https://web.archive.org/web/20190228122615/https://developers.google.com/authorized-buyers/rtb/downloads/realtime-bidding-proto}}, +} + +@misc{google-dn, + author = {Google LLC.}, + title = {About targeting for Display Network campaigns}, + howpublished = {\url{https://web.archive.org/web/20190228122431/https://support.google.com/google-ads/answer/2404191?hl=en&ref\_topic=3121944\%5C}}, +} + +@misc{google-purge, + author = {Google LLC.}, + title = {Flush Cache}, + howpublished = {\url{https://web.archive.org/web/20190228150306/https://developers.google.com/speed/public-dns/cache}}, +} + +@misc{cf-purge, + author = {Cloudflare Inc.}, + title = {Purge Cache}, + howpublished = {\url{https://web.archive.org/web/20190228150344/https://1.1.1.1/purge-cache/}}, +} + +@misc{bug-report, + author = {Tobias Pulls and Rasmus Dahlberg}, + title = {{OOM} manger wipes entire {DNS} cache}, + howpublished = {\url{https://trac.torproject.org/projects/tor/ticket/29617}}, + year = {2020}, +} + +@misc{tor-0401, + author = {Nick Mathewson}, + title = {New Release: {Tor} 0.4.0.1-alpha }, + howpublished = {\url{https://blog.torproject.org/new-release-tor-0401-alpha}, accessed 2019-02-08}, +} + +@misc{netcraft-survey, + author = {Netcraft}, + title = {January 2019 Web Server Survey}, + howpublished = {\url{https://web.archive.org/web/20190208081915/https://news.netcraft.com/archives/category/web-server-survey/}} +} + +@inproceedings{DBLP:conf/ccs/JuarezAADG14, + author = {Marc Ju{\'{a}}rez and + Sadia Afroz and + Gunes Acar and + Claudia D{\'{\i}}az and + Rachel Greenstadt}, + title = {A Critical Evaluation of Website Fingerprinting Attacks}, + booktitle = {{CCS}}, + year = {2014} +} + +@article{chow1970optimum, + title={On optimum recognition error and reject tradeoff}, + author={Chow, C}, + journal={IEEE Trans. Inf. Theory}, + volume={16}, + number={1}, + year={1970}, +} + +@inproceedings{stolerman2013classify, + title={Classify, but verify: Breaking the closed-world assumption in stylometric authorship attribution}, + author={Stolerman, Ariel and Overdorf, Rebekah and Afroz, Sadia and Greenstadt, Rachel}, + booktitle={IFIP Working Group}, + volume={11}, + year={2013} +} + +@inproceedings{DBLP:conf/ndss/GreschbachPRWF17, + author = {Benjamin Greschbach and + Tobias Pulls and + Laura M. Roberts and + Phillip Winter and + Nick Feamster}, + title = {The Effect of {DNS} on {Tor}'s Anonymity}, + booktitle = {{NDSS}}, + year = {2017}, +} + +@article{DBLP:journals/popets/WangG16, + author = {Tao Wang and + Ian Goldberg}, + title = {On Realistically Attacking {Tor} with Website Fingerprinting}, + journal = {PETS}, + volume = {2016}, + number = {4}, +} + +@inproceedings{DBLP:conf/uss/KwonALDD15, + author = {Albert Kwon and + Mashael AlSabah and + David Lazar and + Marc Dacier and + Srinivas Devadas}, + title = {Circuit Fingerprinting Attacks: Passive Deanonymization of {Tor} Hidden + Services}, + booktitle = {{USENIX} Security}, + year = {2015}, +} + +@inproceedings{DBLP:conf/wpes/PanchenkoMHLWE17, + author = {Andriy Panchenko and + Asya Mitseva and + Martin Henze and + Fabian Lanze and + Klaus Wehrle and + Thomas Engel}, + title = {Analysis of Fingerprinting Techniques for {Tor} Hidden Services}, + booktitle = {{WPES}}, + year = {2017}, +} + +@inproceedings{jansenccs18, + author = {Rob Jansen and Matthew Traudt and Nick Hopper}, + title = {Privacy-Preserving Dynamic Learning of {Tor} Network Traffic}, + booktitle = {{CCS}}, + year = {2018} +} + +@inproceedings{DBLP:conf/ccs/JansenJ16, + author = {Rob Jansen and + Aaron Johnson}, + title = {Safely Measuring {Tor}}, + booktitle = {{CCS}}, + year = {2016}, +} + +@inproceedings{DBLP:conf/wpes/VinesRK17, + author = {Paul Vines and + Franziska Roesner and + Tadayoshi Kohno}, + title = {Exploring {ADINT:} Using Ad Targeting for Surveillance on a Budget + - or - How Alice Can Buy Ads to Track Bob}, + booktitle = {{WPES}}, + year = {2017}, +} + +@article{riggingranking, + author = {Victor Le Pochat and + Tom van Goethem and + Wouter Joosen}, + title = {Rigging Research Results by Manipulating Top Websites Rankings}, + journal = {CoRR}, + volume = {abs/1806.01156}, + year = {2018}, +} + +@inproceedings{torusage, + author = {Akshaya Mani and + T. Wilson{-}Brown and + Rob Jansen and + Aaron Johnson and + Micah Sherr}, + title = {Understanding {Tor} Usage with Privacy-Preserving Measurement}, + booktitle = {{IMC}}, + year = {2018} +} + +@article{lyon2014surveillance, + title={Surveillance, {Snowden}, and big data: Capacities, consequences, critique}, + author={Lyon, David}, + journal={Big Data \& Society}, + volume={1}, + number={2}, + year={2014}, + publisher={SAGE Publications Sage UK: London, England} +} + +@inproceedings{Wang, + author = {Tao Wang and + Xiang Cai and + Rishab Nithyanand and + Rob Johnson and + Ian Goldberg}, + title = {Effective Attacks and Provable Defenses for Website Fingerprinting}, + booktitle = {{USENIX} Security}, + year = {2014}, +} + +@article{Cherubin17, + author = {Giovanni Cherubin}, + title = {Bayes, not Na{\"{\i}}ve: Security Bounds on Website Fingerprinting Defenses}, + journal = {PETS}, + volume = {2017}, + number = {4}, +} + +@inproceedings{Tamaraw, + author = {Xiang Cai and + Rishab Nithyanand and + Tao Wang and + Rob Johnson and + Ian Goldberg}, + title = {A Systematic Approach to Developing and Evaluating Website Fingerprinting + Defenses}, + booktitle = {{CCS}}, + year = {2014}, +} + +@inproceedings{csbuflo, + author = {Xiang Cai and + Rishab Nithyanand and + Rob Johnson}, + title = {{CS-BuFLO}: {A} Congestion Sensitive Website Fingerprinting Defense}, + booktitle = {{WPES}}, + year = {2014}, +} + +@inproceedings{DF, + author = {Payap Sirinam and + Mohsen Imani and + Marc Ju{\'{a}}rez and + Matthew Wright}, + title = {Deep Fingerprinting: Undermining Website Fingerprinting Defenses with + Deep Learning}, + booktitle = {{CCS}}, + year = {2018} +} + +@inproceedings{wtf-pad, + author = {Marc Ju{\'{a}}rez and + Mohsen Imani and + Mike Perry and + Claudia D{\'{\i}}az and + Matthew Wright}, + title = {Toward an Efficient Website Fingerprinting Defense}, + booktitle = {{ESORICS}}, + year = {2016} +} + +@inproceedings{WT, + author = {Tao Wang and + Ian Goldberg}, + title = {Walkie-Talkie: An Efficient Defense Against Passive Website Fingerprinting + Attacks}, + booktitle = {{USENIX} Security}, + year = {2017} +} + +@inproceedings{DynaFlow, + author = {David Lu and + Sanjit Bhat and + Albert Kwon and + Srinivas Devadas}, + title = {DynaFlow: An Efficient Website Fingerprinting Defense Based on Dynamically-Adjusting + Flows}, + booktitle = {{WPES}}, + year = {2018} +} + +@inproceedings{tor, + author = {Roger Dingledine and + Nick Mathewson and + Paul F. Syverson}, + title = {Tor: The Second-Generation Onion Router}, + booktitle = {{USENIX} Security}, + year = {2004} +} + +@inproceedings{trilemma, + author = {Debajyoti Das and + Sebastian Meiser and + Esfandiar Mohammadi and + Aniket Kate}, + title = {Anonymity Trilemma: Strong Anonymity, Low Bandwidth Overhead, Low + Latency - Choose Two}, + booktitle = {{IEEE} {S\&P}}, + year = {2018}, +} + +@inproceedings{SunSWRPQ02, + author = {Qixiang Sun and + Daniel R. Simon and + Yi{-}Min Wang and + Wilf Russell and + Venkata N. Padmanabhan and + Lili Qiu}, + title = {Statistical Identification of Encrypted Web Browsing Traffic}, + booktitle = {{IEEE S\&P}}, + year = {2002} +} + +@article{rtb, + author = {Jun Wang and + Weinan Zhang and + Shuai Yuan}, + title = {Display Advertising with Real-Time Bidding {(RTB)} and Behavioural + Targeting}, + journal = {Foundations and Trends in Information Retrieval}, + year = {2017} +} + +@inproceedings{ocsp-chung, + author = {Taejoong Chung and + Jay Lok and + Balakrishnan Chandrasekaran and + David R. Choffnes and + Dave Levin and + Bruce M. Maggs and + Alan Mislove and + John P. Rula and + Nick Sullivan and + Christo Wilson}, + title = {Is the Web Ready for {OCSP} Must-Staple?}, + booktitle = {{IMC}}, + year = {2018} +} + +@inproceedings{bluecoat, + author = {Chaabane Abdelberi and + Terence Chen and + Mathieu Cunche and + Emiliano De Cristofaro and + Arik Friedman and + Mohamed Ali K{\^{a}}afar}, + title = {Censorship in the Wild: Analyzing Internet Filtering in {Syria}}, + booktitle = {{IMC}}, + year = {2014} +} + +@inproceedings{PanchenkoNZE11, + author = {Andriy Panchenko and + Lukas Niessen and + Andreas Zinnen and + Thomas Engel}, + title = {Website fingerprinting in onion routing based anonymization networks}, + booktitle = {{WPES}}, + year = {2011} +} + +@inproceedings{Hintz02, + author = {Andrew Hintz}, + title = {Fingerprinting Websites Using Traffic Analysis}, + booktitle = {{PETS}}, + year = {2002} +} + +@inproceedings{HerrmannWF09, + author = {Dominik Herrmann and + Rolf Wendolsky and + Hannes Federrath}, + title = {Website fingerprinting: attacking popular privacy enhancing technologies + with the multinomial na{\"{\i}}ve-bayes classifier}, + booktitle = {{CCSW}}, + year = {2009}, +} + +@inproceedings{kfp, + author = {Jamie Hayes and + George Danezis}, + title = {k-fingerprinting: {A} Robust Scalable Website Fingerprinting Technique}, + booktitle = {{USENIX} Security}, + year = {2016}, +} + +@inproceedings{CaiZJJ12, + author = {Xiang Cai and + Xin Cheng Zhang and + Brijesh Joshi and + Rob Johnson}, + title = {Touching from a distance: website fingerprinting attacks and defenses}, + booktitle = {{CCS}}, + year = {2012}, +} + +@inproceedings{DBLP:conf/esorics/ShmatikovW06, + author = {Vitaly Shmatikov and + Ming{-}Hsiu Wang}, + title = {Timing Analysis in Low-Latency Mix Networks: Attacks and Defenses}, + booktitle = {{ESORICS}}, + year = {2006}, +} + +@misc{anonterm, + title={A terminology for talking about privacy by data minimization: Anonymity, unlinkability, undetectability, unobservability, pseudonymity, and identity management}, + author={Pfitzmann, Andreas and Hansen, Marit}, + publisher={Dresden, Germany}, + year={2010}, +} + +@inproceedings{JansenJGED18, + author = {Rob Jansen and + Marc Ju{\'{a}}rez and + Rafa Galvez and + Tariq Elahi and + Claudia D{\'{\i}}az}, + title = {Inside Job: Applying Traffic Analysis to Measure {Tor} from Within}, + booktitle = {{NDSS}}, + year = {2018} +} + +@article{GoldwasserM84, + author = {Shafi Goldwasser and + Silvio Micali}, + title = {Probabilistic Encryption}, + journal = {JCSS}, + volume = {28}, + number = {2}, + year = {1984}, +} + +@inproceedings{NaorY90, + author = {Moni Naor and + Moti Yung}, + title = {Public-key Cryptosystems Provably Secure against Chosen Ciphertext + Attacks}, + booktitle = {Proc. Annu. ACM Symp. Theory Comput.}, + year = {1990} +} + +@inproceedings{RackoffS91, + author = {Charles Rackoff and + Daniel R. Simon}, + title = {Non-Interactive Zero-Knowledge Proof of Knowledge and Chosen Ciphertext + Attack}, + booktitle = {{CRYPTO}}, + year = {1991} +} + +@inproceedings{Bleichenbacher98, + author = {Daniel Bleichenbacher}, + title = {Chosen Ciphertext Attacks Against Protocols Based on the {RSA} Encryption + Standard {PKCS} {\#}1}, + booktitle = {{CRYPTO}}, + year = {1998} +} + +@article{RonenGGSWY18, + author = {Eyal Ronen and + Robert Gillham and + Daniel Genkin and + Adi Shamir and + David Wong and + Yuval Yarom}, + title = {The 9 Lives of {Bleichenbacher's} {CAT:} New Cache ATtacks on {TLS} + Implementations}, + journal = {{IACR} Cryptology ePrint Archive}, + year = {2018}, +} + +@inproceedings{DBLP:conf/sp/DyerCRS12, + author = {Kevin P. Dyer and + Scott E. Coull and + Thomas Ristenpart and + Thomas Shrimpton}, + title = {Peek-a-Boo, {I} Still See You: Why Efficient Traffic Analysis Countermeasures + Fail}, + booktitle = {{IEEE} {S\&P}}, + year = {2012} +} + +@inproceedings{DBLP:conf/wpes/NithyanandCJ14, + author = {Rishab Nithyanand and + Xiang Cai and + Rob Johnson}, + title = {Glove: {A} Bespoke Website Fingerprinting Defense}, + booktitle = {{WPES}}, + year = {2014} +} + +@inproceedings{mathews2018understanding, + title={UNDERSTANDING FEATURE DISCOVERY IN WEBSITE FINGERPRINTING ATTACKS}, + author={Mathews, Nate and Sirinam, Payap and Wright, Matthew}, + booktitle={{WNYISPW}}, + year={2018}, +} + +@article{abe2016fingerprinting, + title={Fingerprinting attack on {Tor} anonymity using deep learning}, + author={Abe, Kota and Goto, Shigeki}, + journal={Proceedings of the Asia-Pacific Advanced Network}, + volume={42}, + year={2016} +} + +@inproceedings{DBLP:conf/ndss/RimmerPJGJ18, + author = {Vera Rimmer and + Davy Preuveneers and + Marc Ju{\'{a}}rez and + Tom van Goethem and + Wouter Joosen}, + title = {Automated Website Fingerprinting through Deep Learning}, + booktitle = {{NDSS}}, + year = {2018} +} + +@inproceedings{cumul, + author = {Andriy Panchenko and + Fabian Lanze and + Jan Pennekamp and + Thomas Engel and + Andreas Zinnen and + Martin Henze and + Klaus Wehrle}, + title = {Website Fingerprinting at Internet Scale}, + booktitle = {{NDSS}}, + year = {2016} +} + +@article{cheng1998traffic, + title={Traffic analysis of {SSL} encrypted web browsing}, + author={Cheng, Heyning and Avnur, Ron}, + journal={Project paper, University of Berkeley}, + year={1998} +} + +@inproceedings{DBLP:conf/sp/SunSWRPQ02, + author = {Qixiang Sun and + Daniel R. Simon and + Yi{-}Min Wang and + Wilf Russell and + Venkata N. Padmanabhan and + Lili Qiu}, + title = {Statistical Identification of Encrypted Web Browsing Traffic}, + booktitle = {{IEEE S\&P}}, + year = {2002} +} + +@inproceedings{DBLP:conf/ccs/LiberatoreL06, + author = {Marc Liberatore and + Brian Neil Levine}, + title = {Inferring the source of encrypted {HTTP} connections}, + booktitle = {{CCS}}, + year = {2006} +} + +@inproceedings{KedoganAP02, + author = {Dogan Kesdogan and + Dakshi Agrawal and + Stefan Penz}, + title = {Limits of Anonymity in Open Environments}, + booktitle = {{IH}}, + year = {2002} +} + +@inproceedings{DBLP:conf/pet/Danezis04, + author = {George Danezis}, + title = {The Traffic Analysis of Continuous-Time Mixes}, + booktitle = {{PETS}}, + year = {2004} +} + +@inproceedings{DBLP:conf/ih/DanezisS04, + author = {George Danezis and + Andrei Serjantov}, + title = {Statistical Disclosure or Intersection Attacks on Anonymity Systems}, + booktitle = {{IH}}, + year = {2004} +} + +@inproceedings{DBLP:conf/diau/BertholdPS00, + author = {Oliver Berthold and + Andreas Pfitzmann and + Ronny Standtke}, + title = {The Disadvantages of Free {MIX} Routes and how to Overcome Them}, + booktitle = {International Workshop on Design Issues in Anonymity and Unobservability}, + year = {2000}, +} + +@inproceedings{KesdoganP04, + author = {Dogan Kesdogan and + Lexi Pimenidis}, + title = {The Hitting Set Attack on Anonymity Protocols}, + booktitle = {{IH}}, + year = {2004} +} + +@inproceedings{TroncosoGPV08, + author = {Carmela Troncoso and + Benedikt Gierlichs and + Bart Preneel and + Ingrid Verbauwhede}, + title = {Perfect Matching Disclosure Attacks}, + booktitle = {{PETS}}, + year = {2008} +} + +@inproceedings{DiazSCP02, + author = {Claudia D{\'{\i}}az and + Stefaan Seys and + Joris Claessens and + Bart Preneel}, + title = {Towards Measuring Anonymity}, + booktitle = {{PETS}}, + year = {2002} +} + +@inproceedings{SerjantovD02, + author = {Andrei Serjantov and + George Danezis}, + title = {Towards an Information Theoretic Metric for Anonymity}, + booktitle = {{PETS}}, + year = {2002} +} + +@inproceedings{Raymond00, + author = {Jean{-}Fran{\c{c}}ois Raymond}, + title = {Traffic Analysis: Protocols, Attacks, Design Issues, and Open Problems}, + booktitle = {International Workshop on Design Issues in Anonymity and Unobservability}, + year = {2000} +} + +@inproceedings{Danezis03, + author = {George Danezis}, + title = {Statistical Disclosure Attacks}, + booktitle = {{IFIP SEC}}, + year = {2003} +} + +@inproceedings{MurdochD05, + author = {Steven J. Murdoch and + George Danezis}, + title = {Low-Cost Traffic Analysis of {Tor}}, + booktitle = {{IEEE} {S\&P}}, + year = {2005} +} + +@inproceedings{ChakravartySK10, + author = {Sambuddho Chakravarty and + Angelos Stavrou and + Angelos D. Keromytis}, + title = {Traffic Analysis against Low-Latency Anonymity Networks Using Available + Bandwidth Estimation}, + booktitle = {{ESORICS}}, + year = {2010} +} + +@inproceedings{MittalKJCB11, + author = {Prateek Mittal and + Ahmed Khurshid and + Joshua Juen and + Matthew Caesar and + Nikita Borisov}, + title = {Stealthy traffic analysis of low-latency anonymous communication using + throughput fingerprinting}, + booktitle = {{CCS}}, + year = {2011} +} + +@inproceedings{deepcorr, + author = {Milad Nasr and + Alireza Bahramali and + Amir Houmansadr}, + title = {DeepCorr: Strong Flow Correlation Attacks on {Tor} Using Deep Learning}, + booktitle = {{CCS}}, + year = {2018} +} + +@inproceedings{JohnsonWJSS13, + author = {Aaron Johnson and + Chris Wacek and + Rob Jansen and + Micah Sherr and + Paul F. Syverson}, + title = {Users get routed: traffic correlation on {Tor} by realistic adversaries}, + booktitle = {{CCS}}, + year = {2013} +} + +@inproceedings{BorisovDMT07, + author = {Nikita Borisov and + George Danezis and + Prateek Mittal and + Parisa Tabriz}, + title = {Denial of service or denial of security?}, + booktitle = {{CCS}}, + year = {2007} +} + +@inproceedings{SunEVLRCM15, + author = {Yixin Sun and + Anne Edmundson and + Laurent Vanbever and + Oscar Li and + Jennifer Rexford and + Mung Chiang and + Prateek Mittal}, + title = {{RAPTOR:} Routing Attacks on Privacy in {Tor}}, + booktitle = {{USENIX} Security}, + year = {2015} +} + +@inproceedings{ScheitleHGJZSV18, + author = {Quirin Scheitle and + Oliver Hohlfeld and + Julien Gamba and + Jonas Jelten and + Torsten Zimmermann and + Stephen D. Strowes and + Narseo Vallina{-}Rodriguez}, + title = {A Long Way to the Top: Significance, Structure, and Stability of Internet + Top Lists}, + booktitle = {{IMC}}, + year = {2018} +} + +@article{DBLP:journals/ton/CaoQWDKM18, + author = {Yue Cao and + Zhiyun Qian and + Zhongjie Wang and + Tuan Dao and + Srikanth V. Krishnamurthy and + Lisa M. Marvel}, + title = {Off-Path {TCP} Exploits of the Challenge {ACK} Global Rate Limit}, + journal = {{IEEE/ACM} Trans. Netw.}, + volume = {26}, + number = {2}, + year = {2018} +} + +@inproceedings{DBLP:conf/ccs/QianMX12, + author = {Zhiyun Qian and + Zhuoqing Morley Mao and + Yinglian Xie}, + title = {Collaborative {TCP} sequence number inference attack: how to crack + sequence number under a second}, + booktitle = {{CCS}}, + year = {2012} +} + +@inproceedings{DBLP:conf/uss/EnsafiPKC10, + author = {Roya Ensafi and + Jong Chun Park and + Deepak Kapur and + Jedidiah R. Crandall}, + title = {Idle Port Scanning and Non-interference Analysis of Network Protocol + Stacks Using Model Checking}, + booktitle = {{USENIX} Security}, + year = {2010} +} + +@misc{onionv2, + author = {{Tor Project}}, + title = {{Tor} Rendezvous Specification - Version 2}, + howpublished = {\url{https://gitweb.torproject.org/torspec.git/tree/rend-spec-v2.txt}, accessed 2019-02-13}, +} + +@misc{onionv3, + author = {{Tor Project}}, + title = {{Tor} Rendezvous Specification - Version 3}, + howpublished = {\url{https://gitweb.torproject.org/torspec.git/tree/rend-spec-v3.txt}, accessed 2019-02-13}, +} + +@inproceedings{Merget19, + author = {Robert Merget and Juraj Somorovsky and Nimrod Aviram and Craig Young and Janis Fliegenschmidt and Jörg Schwenk and Yuval Shavitt}, + title = {Scalable Scanning and Automatic Classification of {TLS} Padding Oracle Vulnerabilities}, + booktitle = {{USENIX} Security}, + year = {2019}, + note = {to appear} +} + +@inproceedings{DBLP:conf/pet/WinterKMHSLW14, + author = {Philipp Winter and + Richard K{\"{o}}wer and + Martin Mulazzani and + Markus Huber and + Sebastian Schrittwieser and + Stefan Lindskog and + Edgar R. Weippl}, + title = {Spoiled Onions: Exposing Malicious {Tor} Exit Relays}, + booktitle = {{PETS}}, + year = {2014} +} + +@phdthesis{Wang2015a, + author = {Tao Wang}, + title = {Website Fingerprinting: Attacks and Defenses}, + school = {University of Waterloo}, + year = {2015}, + howpublished = {\url{https://nymity.ch/tor-dns/pdf/Wang2015a.pdf}}, +} + +@misc{perryCrit, + author = {Mike Perry}, + title = {A Critique of Website Traffic Fingerprinting Attacks}, + howpublished = {\url{https://blog.torproject.org/critique-website-traffic-fingerprinting-attacks}, accessed 2019-02-08}, +} + +@article{DBLP:journals/jsac/ReedSG98, + author = {Michael G. Reed and Paul F. Syverson and David M. Goldschlag}, + title = {Anonymous connections and onion routing}, + journal = {{JSAC}}, + volume = {16}, + number = {4}, + year = {1998}, +} diff --git a/summary/src/cat/src/related.tex b/summary/src/cat/src/related.tex new file mode 100644 index 0000000..6c36654 --- /dev/null +++ b/summary/src/cat/src/related.tex @@ -0,0 +1,64 @@ +\section{Related Work} \label{cat:sec:related} +The combination of a WF attack with a WO is a type of Classify-Verify method as +proposed by Stolerman et al.~\cite{stolerman2013classify}, which in turn is a +type of rejection function as described by Chow~\cite{chow1970optimum}. Such a +method was first used in the context of WF by Juarez +\emph{et~al.}~\cite{DBLP:conf/ccs/JuarezAADG14} and later by Greschbach +\emph{et~al.} \cite{DBLP:conf/ndss/GreschbachPRWF17} to augment WF attacks with +inferences from observed DNS traffic. Note that the attack by Greschbach et al. +can be seen as a probabilistic WO due to the attacker under their threat model +only observing a fraction of DNS traffic from the Tor network. Our work builds +upon and generalises their work where DNS traffic is just one of many possible +sources to infer website visits from. Further, our DNS-based sources are usable +by anyone instead of relatively strong network attackers (or Google or +Cloudflare). + +All anonymity networks produce anonymity sets (per definition) that change with +observations by an attacker over time~\cite{Raymond00}. Modelling the behaviour +of an anonymity system (as a mix), what the attacker observes, and how the +anonymity sets change over time allows us to reason about how the attacker can +perform traffic analysis and break the anonymity provided by the +system~\cite{DiazSCP02,KedoganAP02,SerjantovD02}. Attacks along these lines are +many with more-or-less consistent terminology, including intersection attacks, +(statistical) disclosure attacks, and traffic confirmation +attacks~\cite{DBLP:conf/diau/BertholdPS00,Danezis03, +DBLP:conf/pet/Danezis04,DBLP:conf/ih/DanezisS04,KesdoganP04,Raymond00, +DBLP:journals/jsac/ReedSG98,TroncosoGPV08}. + +WOs are nothing more than applying the notion of anonymity sets to the potential +destination websites visited over an anonymity network like Tor and giving an +attacker the ability to query this anonymity set for membership for a limited +number of monitored websites. The way we use WOs in our generic attacks is +\emph{not to learn long-term statistically unlikely relationships} between +senders and recipients in a network. Rather, the WO is only used to learn +\emph{part of the anonymity set at the time of the attack}. That an attacker can +observe anonymity sets is not novel, what is novel in our work is how we apply +it to the WF domain and argue for its inclusion as a core attacker capability +when modelling WF attacks and defenses. + +Murdoch and Danezis showed how to use observed latency in Tor as an oracle to +perform traffic analysis attacks \cite{MurdochD05}. Chakravarty \emph{et~al.} +detailed similar attacks but based on bandwidth estimation +\cite{ChakravartySK10} and Mittal \emph{et~al.} using throughput +estimation~\cite{MittalKJCB11}. Attackers in these cases do not need to be +directly in control of significant fractions Tor, but rather use network +measurements to infer the state of the network and create an oracle that an +attacker can utilize, similar to WOs. + +Correlation of input and output flows is at the core of many attacks on +anonymity networks like Tor~\cite{BorisovDMT07,JohnsonWJSS13,SunEVLRCM15}. Flow +correlation attacks correlate traffic on the network layer, considering packet +sizes and timing of sent traffic. The RAPTOR attack by Sun et +al.~\cite{SunEVLRCM15} needs about 100MB of data sent over five minutes to +correlate flows with high accuracy. The recent state-of-the-art attack DeepCorr +by Nasr \emph{et~al.} \cite{deepcorr}---based on deep learning like Deep +Fingerprinting by Sirinam \emph{et~al.}~\cite{DF}---needs only about 900KB of +data (900 packets) for comparable accuracy to RAPTOR. While flow correlation +attacks like RAPTOR and DeepCorr operate on the network layer, WF+WO attacks can +be viewed as \emph{application layer} correlation attacks. WF attacks extract +the application-layer data (the website) while WOs reconstruct parts of the +anonymity set of possible monitored websites visited. WF attacks need to observe +most of the traffic generated when visiting a website that goes into the +anonymity network. While a WO does not have to directly view any of the output +flows of the network, it needs to be able to infer if a particular website was +visited during a period of time, as shown in Section~\ref{cat:sec:sources}. diff --git a/summary/src/cat/src/sim.tex b/summary/src/cat/src/sim.tex new file mode 100644 index 0000000..4077b89 --- /dev/null +++ b/summary/src/cat/src/sim.tex @@ -0,0 +1,131 @@ +\section{Simulating Website Oracles} \label{cat:sec:sim} +To be able to \emph{simulate} access to a WO for \emph{arbitrary monitored +websites} we need to simulate the entire website anonymity set of Tor, because +the anonymity set is what a WO queries for membership. We opt for simulation for +ethical reasons. The simulation has three key parts: how those visits are +distributed, the number of visits to websites over Tor, and the timeframe +(resolution) of the oracle source. Note that the first two parts are easy for an +attacker to estimate by simply observing traffic from live Tor exit relays, +something we cannot trivially do as researchers adhering to Tor's research +safety +guidelines~\cite{tor-safety-board}. +Another option available to an attacker is to repeatedly query a WO to learn +about the popularity of its monitored websites and based on those figures infer +the utility of the WO. We opted to not perform such measurements ourselves, +despite access to several WOs, due to fears of inadvertently harming Tor users. +Instead we base our simulations on results from the privacy-preserving +measurements of the Tor network in early 2018 by Mani +\emph{et~al.}~\cite{torusage}. + +\subsection{How Website Visits are Distributed} +\label{cat:sec:sim:dist} +Table~\ref{cat:table:visits} shows the average inferred website popularity from Mani +\emph{et~al.}~\cite{torusage}. The average percentage does not add up to 100\%, +presumably due to the privacy-preserving measurement technique or rounding +errors. Their results show that \texttt{torproject.org} is very popular (perhaps +due to a bug in software using Tor), and beyond that focus on +Alexa's~\cite{alexa} top one million most +popular websites as bins. The ``other'' category is for websites identified not +part of Alexa's top one million websites ranking. For the rest of the analysis +(not simulation) in this paper we \emph{exclude} \texttt{torproject.org}: for one, +that Tor users visit that website is unlikely to be an interesting fact for an +attacker to monitor, and its over-representation (perhaps due to a bug) will +skew our analysis. Excluding \texttt{torproject.org}, about one third of all +website visits go to Alexa (0,1k], one third to Alexa (1k,1m], and one third to +other websites. The third column of Table~\ref{cat:table:visits} contains adjusted +average percentages. + +\begin{table}[!t] +\caption{Inferred average website popularity for the entire Tor network early 2018, from Mani \emph{et~al.}~\cite[Figure 2]{torusage}.} +\centering +\begin{tabular}{lcc} +Website & Average & Without\\ + & primary domain (\%) & torproject.org\\ +\midrule +torproject.org & 40.1 & \\ +Alexa (0,10] & 8.4 & 13.9 \\ +Alexa (10,100] & 5.1 & 8.4 \\ +Alexa (100,1k] & 6.2 & 10.3 \\ +Alexa (1k,10k] & 4.3 & 7.1 \\ +Alexa (10k,100k] & 7.7 & 12.7\\ +Alexa (100k,1m] & 7.0 & 11.6 \\ +other & 21.7 & 35.9 \\ +\end{tabular} +\label{cat:table:visits} +\end{table} + +In our simulations for website visits we treat the entries in column two of +Table~\ref{cat:table:visits} as bins of a histogram with the relative size indicated +by the average website popularity. After randomly selecting a bin (weighted by +popularity), in the case of an Alexa range we uniformly select a website within +the range, and for the other category we uniformly select from one million other +websites. This is a conservative choice given that there are hundreds of +millions of active websites on the Internet. Uniformly selecting within a bin +will make the more popular websites in the bin likely underrepresented while +less popular websites in the bin get overrepresented. However, we typically +simulate an attacker that monitors $\approx$100 websites and use the website +popularity as the starting rank of the first monitored website. For the most +popular websites, monitoring 100 websites covers the entire or significant +portions of the bins (Alexa $\leq$1k), and for less popular websites (Alexa +$>$1k), as our results later show, this does not matter. + +\subsection{The Number of Website Visits} +Mani \emph{et~al.} also inferred with a 95\% confidence interval that +$(104\pm36)*10^6$ \emph{initial} streams are created during a 24 hour period in +the entire Tor network \cite{torusage}. Based on this, in our simulation we +assume 140 million website visits per day that are distributed as described +above and occur uniformly throughout the day. While assuming uniformity is +naive, we selected the upper limit of the confidence interval to somewhat negate +any unreasonable advantage to the attacker. + +\subsection{A Reasonable Timeframe} +\label{cat:sec:sim:timeframe} +Wang and Goldberg show that it is realistic to assume that an attacker can +determine the start of a webpage load even in the presence of background noise +and multiple concurrent website visits~\cite{DBLP:journals/popets/WangG16}. An +attacker can further determine if a circuit is used for onion services or +not~\cite{DBLP:conf/uss/KwonALDD15,DBLP:conf/wpes/PanchenkoMHLWE17}. Now, +consider an attacker that observes traffic between a Tor client and its guard. +The initial stream contains the first HTTP GET request for a typical website +visit. The request will be the first outgoing packet as part of a website visit +once a connection has been established. When the request arrives at the +destination is the point in time when an oracle, e.g., instantiated by access +logs would record this time as the time of visit. Clearly, the exact time is +between the request and the response packets and the attacker observes the +timing of those packets. So what is a realistic timeframe for the attacker to +use when it queries a WO? + +Between January 22--30 (2019) we performed Round-Trip Time (RTT) measurements +using four Amazon EC2 instances that ran \emph{their own} nginx HTTP(S) servers +to visit \emph{themselves} over Tor (with \texttt{torify curl}) using a fresh +circuit for each visit. This allowed us easy access to start and stop times for +the RTT measurement, as well as the time a request appeared in the nginx access +log (without any clock-drift). In total we collected 21,497 HTTP traces and +21,492 HTTPS traces, where each trace contains start, log, and stop timestamps. +Our results are shown in Figure~\ref{cat:fig:aws}. It is clear that that log-to-stop +times are independent of HTTP(S). More than half of all log-to-stop times +($54.5$\%) are within a 100~ms window (see 40--140~ms), and nearly all +log-to-stop times are less than 1000~ms. + +\begin{figure}[!t] + \centering + \includegraphics[width=0.7\textwidth]{src/cat/img/aws} + \caption{% + Time differences between start, log, and stop events when visiting a website over HTTP(S) using Tor. + } + \label{cat:fig:aws} +\end{figure} + +Based on our experiment results we consider three timeframes relevant: 10 ms, +100 ms, and 1000 ms. First, 10 ms is relevant as close to optimal for any +attacker. On average, there are only 17 website visits during a 10 ms window in +the entire Tor network. 100 ms is our default for the WF experiments we perform: +we consider it realistic for many sources of WOs (e.g., Cloudflare logs and +real-time bidding). We also consider a 1000 ms timeframe relevant due to the +prevalence of sources of WOs with a resolution in seconds (e.g., due to Unix +timestamps or TTLs for DNS). Based on our simulations and the different +timeframes, Appendix~\ref{cat:app:bayes} contains an analysis of the utility of WOs +using Bayes' law. Appendix~\ref{cat:app:lessons} presents some key lessons from the +simulation, in particular that while the resolution and resulting timeframe is +an important metric in our simulation, it is minor in comparison to the overall +website popularity in Tor of the monitored websites. diff --git a/summary/src/cat/src/sources.tex b/summary/src/cat/src/sources.tex new file mode 100644 index 0000000..89616ad --- /dev/null +++ b/summary/src/cat/src/sources.tex @@ -0,0 +1,204 @@ +\section{Sources of Website Oracles} \label{cat:sec:sources} +There are a wide range of potential sources of WOs. Table~\ref{cat:table:sources} +summarizes a selection of sources that are more thoroughly detailed in +Appendix~\ref{cat:app:sources}. The table shows the availability of the source, +i.e., if the attacker needs to query the source in near real-time as a website +visit occurs or if it can be accessed retroactively, e.g., through a legal +request. We also estimate qualitatively the false positive rate of the source, +its coverage of websites it can monitor (or fraction of Tor network traffic, +depending on source), as well as the estimated effort to access the source. +Finally, the table gives an example of an actor with access to the source. + +Next we focus on a number of sources of WOs that we find particularly relevant: +several due to DNS in Section~\ref{cat:sec:sources:dns}, the DHT of Tor onion +directory services in Section~\ref{cat:sec:sources:dht}, and real-time bidding +platforms in Section~\ref{cat:sec:sources:rtb}. + +\begin{sidewaystable} + \caption{Comparison of a number of WO sources based on their \emph{estimated} time of availability (when attacker likely has to collect data, i.e., retroactively or real-time), False Positive Rate (FPR), coverage of website/network visits, and primary entities with access.} + \centering + \label{cat:table:sources} + \begin{tabular}{l c c c c r} + Source & Availability & FPR & Coverage & Effort & Access \\ \midrule + Dragnet surveillance programmes & retroactive & negl. & high & high & intelligence agencies \\ + Content Delivery Networks & retroactive & negl. & high & high & operators\\ + Real-time bidding & real-time (retroactive) & negl. & high & modest & customers (operator)\\ + Webserver access logs & retroactive & negl. & high & medium & operators\\ + Middleboxes & retroactive~\cite{bluecoat} & negl. & medium & medium & operators \\ + OCSP & retroactive & low & high & medium & few CAs, plaintext\\ + 8.8.8.8 operator & retroactive & low~\cite{DBLP:conf/ndss/GreschbachPRWF17} & 16.8\% of visits & high & Google, plaintext \\ + 1.1.1.1 operator & retroactive & low~\cite{DBLP:conf/ndss/GreschbachPRWF17} & 7.4\% of visits & high & Cloudflare, plaintext \\ + Exit relays & real-time & negl. & low & low & operators \\ + Exit relays DNS cache & real-time & medium & high & medium & anyone\\ + Query DNS resolvers & real-time & high & low & low & anyone \\ + Onion v2 (v3) & real-time & negl. & high (low) & low (high) & anyone \\ + % & & & & & \\ + \end{tabular} +\end{sidewaystable} + +\subsection{DNS} +\label{cat:sec:sources:dns} +Before a website visit the corresponding domain name must be resolved to an IP +address. For a user that uses Tor browser, the exit relay of the +current circuit resolves the domain name. If the DNS record of the domain +name is already cached in the DNS cache of the exit relay, then the exit relay +uses that record. Otherwise the domain name is resolved and subsequently cached +using whichever DNS resolution mechanism that the exit relay has configured. +Based on this process we present three sources of WOs that work for unpopular +websites. + +\subsubsection{Shared Pending DNS Resolutions} +If an exit relay is asked to resolve a domain name that is uncached it will +create a list of pending connections waiting for the domain resolution to +finish. If another connection asks that the same domain name be resolved, it is +added to the list of pending connections. When a result is available all +pending connections are informed. This is the basis of a WO: if a request to +resolve a domain name returns a record \emph{more quickly than previously +measured by the attacker for uncached entries}, the entry was either pending +resolution at the time of the request or already cached. Notably this works +regardless of if exit relays have DNS caches or not. However, the timing +constraints of shared pending connections are significant and thus a practical +hurdle to overcome. + +\subsubsection{Tor's DNS Cache at Exit Relays} +If an unpopular website is visited by a user, the resolved domain name will +likely be cached by a \emph{single} relay. We +performed 411 \texttt{exitmap}~\cite{DBLP:conf/pet/WinterKMHSLW14} +measurements between April 1--10 (2019), collecting on average 3544 +(un)cached data points for each exit using a domain under our control +that is not used by anyone else. + +Given a labelled data set of (un)cached times for each exit relay, we can +construct distinct \emph{per-relay} classifiers that predict whether a measured +time corresponds to an (un)cached domain name. While there are many different +approaches that could be used to build such a classifier, we decided to use a +simple heuristic that should result in little or no false positives: output +`cached' iff no uncached query has \emph{ever} been this fast before. +Figure~\ref{cat:fig:dns:classifier-idea} shows the idea of this classifier in +greater detail, namely create a \emph{threshold} that is the minimum of the +largest cached time and the smallest uncached time and then say cached iff the +measured time is smaller than the threshold. Regardless of how well this +heuristic performs (see below), it should be possible to construct other +classifiers that exploit the trend of smaller resolve times and less standard +deviation for cached queries (Figure~\ref{cat:fig:dns:dist}). For example, 69.1\% of +all exit relays take at least 50~ms more time to resolve an uncached domain on +average. + +\begin{figure}[!t] + \centering + \includegraphics[width=.7\columnwidth]{src/cat/img/dns__classifier-idea} + \caption{The two cases when deciding on a classifier's threshold.} + \label{cat:fig:dns:classifier-idea} +\end{figure} + +\begin{figure}[!t] + \centering + \includegraphics[width=.7\columnwidth]{src/cat/img/dns__timing-dist} + \caption{% + The difference between (un)cached standard deviation and mean times + without any absolute values, i.e., a negative value implies + that the uncached time is smaller than the cached time. + } + \label{cat:fig:dns:dist} +\end{figure} + + +To estimate an \emph{upper bound} on how effective the composite classifier of +all per-relay classifiers could be \emph{without any false positives} using +our heuristic, we applied ten-fold cross-validation to simply exclude every +exit relay that had false positives during any fold and then weighted the +observed bandwidth for the remaining classifiers by the individual true positive +rates. This gives us an +estimate of how much bandwidth we could predict true positives for without +having any false positives. By comparing it to the total exit bandwidth of the +Tor network, we obtain an estimated upper bound true positive rate for the +composite classifier of $17.3$\%. + +When an attacker measures if a domain is cached or not the domain will, after +the measurement, be cached for up to an hour (current highest caching duration +in Tor, independent of TTL) at every exit. However, if a an attacker can cause +an exit to run low on memory, the entire DNS cache will be removed (instead of +only parts of it) due to a bug in the out-of-memory manager of Tor. We have +reported this to the Tor +Project~\cite{bug-report}. +We further discuss in Section~\ref{cat:sec:disc} how frequently an attacker on +average can be expected to query a WO. + +\subsubsection{Caching at Recursive DNS Resolvers} +For a website that is unpopular enough, there is a high chance that nobody on +the web visited the website within a given timeframe. This is the basis of +our next idea for a WO which is \emph{not mutually exclusive} to the Tor +network: wait a couple of seconds after observing a connection, then probe all +recursive DNS resolvers of Tor exits that can be accessed to determine whether +any monitored website was cached approximately at the time of observing the +connection by inspecting TTLs. %the TTL of returned DNS records. + +In 2016 Greschbach~\emph{et~al.}~\cite{DBLP:conf/ndss/GreschbachPRWF17} showed +that remote DNS resolvers like Google's \texttt{8.8.8.8} receive a large +portion of all DNS traffic that exit relays generate. To better understand how +the DNS resolver landscape looks today, we repeated their RIPE Atlas experiment +setup for 35 hours in February 17--18 (2019), measuring every 30 minutes. Our +results show that Google (16.8\%) and Cloudflare (7.4\%) are both popular. Many +exits use a same-AS resolver which is presumably the ISP (42.3\%), while other +exits resolve themselves (15.2\%) or use a remote DNS resolver that we did not +identify (18.2\%). Further, we note that there are at least one RIPE Atlas +network measurement probe in the same AS as 53.3\% of all exits, providing +access to many of the same DNS resolvers as used by exits from a similar network +vantage point. + +Instead of using RIPE Atlas nodes we opted for a different approach which is +\emph{strictly worse}: query Google's and Cloudflare's DNS resolvers from VMs in +16 Amazon EC2 regions. With a simple experiment of first visiting a unique +domain (once again under our control and only used by us) using \texttt{torify +curl} and then querying the DNS resolvers from each Amazon VM to observe TTLs, +we got true positive rates of 2.9\% and 0.9\% for Google and Cloudflare with +1000 repetitions. While this may seem low, the cost for an attacker is at the +time of writing about 2 USD per day using on-demand pricing. Using an identical +setup we were also able to find a subset of monitored websites that yield +alarmingly high true positive rates: 61.4\% (Google) and 8.0\% (Cloudflare). +Presumably this was due to the cached entries being shared over a wider +geographical area for some reason (however, not globally). Regardless, coupled +with the fact that anyone can \emph{globally purge} the DNS caches of +Google~\cite{google-purge} and Cloudflare~\cite{cf-purge} for arbitrary domain +names, this is a noteworthy WO source. + +\subsection{Onion Service Directories in Tor} +\label{cat:sec:sources:dht} +To access an onion service a user first obtains the service's \emph{descriptor} +from a Distributed Hash Table (DHT) maintained by \emph{onion service +directories}. From the descriptor the user learns of \emph{introduction points} +selected by the host of the onion service in the Tor network that are used to +establish a connection to the onion service in a couple of more +steps~\cite{onionv2,onionv3} that are irrelevant here. Observing a request for +the descriptor of a monitored onion service is a source for a WO. To observe +visits for a target (known) onion service in the DHT, a relay first has to be +selected as one out of six or eight (depending on version) relays to host the +descriptor in the DHT, and then the victim has to select that relay to retrieve +the descriptor. For v2 of onion services, the location in the DHT is +deterministic~\cite{onionv2} and an attacker can position its relays in such a +way to always be selected for hosting target descriptors. Version 3 of onion +services addresses this issue by randomising the process every 24 +hours~\cite{onionv3}, forcing an attacker to host a significant number of relays +to get a WO for onion services with high coverage. At the time of writing, there +are about 3,500 relays operating as onion service directories. + +\subsection{Real-Time Bidding} +\label{cat:sec:sources:rtb} +Real-Time Bidding (RTB) is an approach towards online advertisement that allows +a publisher to auction ad space to advertisers on a \emph{per-visit} basis in +real time~\cite{rtb}. Google's Display Network includes more than two million +websites that reach 90\% of all Internet users~\cite{google-dn}, and an +advertiser that uses RTB must respond to submitted bid +requests containing information such as the three first network bytes of an IPv4 +address, the second-level domain name of the visited website, and the user agent +string within $\approx$100~ms~\cite{google-bid}. While the exact information +available to the bidder depends on the ad platform and the publisher's +advertisement settings, anonymous modes provide less +revenue~\cite{google-bid-anon}. +Combined with many flavours of pre-targeting such as IP and location +filtering~\cite{DBLP:conf/wpes/VinesRK17}, it is likely that the bidder knows +whether a user used Tor while accessing a monitored website. Vines et +al.~\cite{DBLP:conf/wpes/VinesRK17} further note that ``35\% of the DSPs also +allow arbitrary IP white-and blacklisting (Admedo, AdWords, Bing, BluAgile, +Criteo, Centro, Choozle, Go2Mobi, Simpli.fi)''. Finally, observe that an +attacker need not win a bid to use RTB as a WO. diff --git a/summary/src/cat/src/wf.tex b/summary/src/cat/src/wf.tex new file mode 100644 index 0000000..6b2dedb --- /dev/null +++ b/summary/src/cat/src/wf.tex @@ -0,0 +1,181 @@ +\section{Deep Fingerprinting with Website Oracles} \label{cat:sec:wf} +We first describe how we augment the Deep Fingerprinting (DF) attack by Sirinam +\emph{et~al.}~\cite{DF} with WO access. Next we evaluate the augmented +classifier on three different datasets with five different WF defenses. Source +code and datasets for simulating WF+WO attacks as well as steps to reproduce all +of the following results using DF are available at +\href{https://github.com/pylls/wfwo}{https://github.com/pylls/wfwo}. + +\subsection{The Augmented Classifier} +\label{cat:sec:wf:aug} +As covered in the background (Section~\ref{cat:sec:back:wf}), DF is a CNN where the +last layer is a softmax. The output is an array of probabilities for each +possible class. Compared to the implementation of DF used by Sirinam +\emph{et~al.}, we changed DF to not first use binary classification in the open +world to determine if it is an unmonitored trace or not, but rather such that +there is one class for each monitored website and one for unmonitored. +Conceptually, this slightly lowers the performance of DF in our analysis, but +our metrics show that mistaking one monitored website for another is +insignificant for the datasets used in the analysis of this paper. The principal +source of false positives is mistaking an unmonitored website for a monitored. + +Given the probability of each possible class as output of DF, we used the second +generic construction (Definition~\ref{cat:def:oracleprob}) from +Section~\ref{cat:sec:oracles:generic} to combine DF with a WO. To update the +remaining probabilities after removing a (monitored) prediction with the help of +the WO, we use a softmax again. However, due to how the softmax function is +defined, it emphasizes differences in values above one and actually +de-emphasizes values between zero and +one~\cite{wiki}. +This is problematic for us because all values we send through the softmax are +probabilities that per definition are between zero and one. To account for this, +we first divide each probability with the maximum probability and multiply with +a constant before performing the softmax. Through trial-and-error, a constant of +five gave us a reasonable threshold in probabilities. Note that this does not in +any way affect the order of likely classes from DF, it simply puts the +probabilities in a span that makes it easier for us to retain a threshold value +between zero and one after multiple calls to the softmax function. + +\subsection{WTF-PAD and Walkie-Talkie} +We use the original dataset of Sirinam \emph{et~al.}~\cite{DF} that consists of +95 monitored websites with 1,000 instances each as well as 20,000 unmonitored +websites (95x1k+20k). The dataset is split 8:1:1 for training, validation, and +testing, respectively. Given the dataset and our changes to DF to not do binary +classification means that our testing dataset is unbalanced in terms of +instances per class. Therefore we show precision-recall curves generated by +alternating the threshold for DF with and without WO access. + +Figure~\ref{cat:fig:df} shows the results of DF and DF+WO with a simulated WO on +Sirinam \emph{et~al.}'s dataset with no defense (Figure~\ref{cat:fig:df:nodef}), +Walkie-Talkie (Figure~\ref{cat:fig:df:wt}), and WTF-PAD +(Figure~\ref{cat:fig:df:wtfpad}). For the WO we use a 100 ms timeframe and plot the +results for different starting Alexa ranks of the 95 monitored websites. +Regardless of defense or not, we observe that for Alexa ranks 1k and less +popular websites the precision is perfect (1.0) regardless of threshold. This +indicates that---for an attacker monitoring frontpages of websites---a 100 ms WO +significantly reduces false positives for two-thirds of all website visits made +over Tor, for the vast majority of potentially monitored frontpages of websites. +Recall is also slightly improved. + +\begin{figure}[!t] + \centering + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/df_nodef} + \caption{No defense.} + \label{cat:fig:df:nodef} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/df_wt} + \caption{Walkie-Talkie~\cite{WT}.} + \label{cat:fig:df:wt} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/df_wtfpad} + \caption{WTF-PAD~\cite{wtf-pad}.} + \label{cat:fig:df:wtfpad} + \end{subfigure} + \caption{Attack simulation for Deep Fingerprinting (DF) with website oracles (100 ms timeframe) on Sirinam \emph{et~al.}'s dataset~\cite{DF}. The lines in each sub-figure show DF with and without website oracle access for different starting Alexa ranks for monitored websites.} + \label{cat:fig:df} +\end{figure} + +For Walkie-Talkie we observe a significant improvement in precision due to WO +access. Wang and Goldberg note that the use of popular websites as decoy +(non-sensitive) websites protects less-popular sensitive websites due to the +base rate: an attacker claiming that the user visited the less-popular website +is (per definition) likely wrong, given that the attacker is able to detect both +potential website visits~\cite{WT}. Access to a WO flips this observation on its +head: if a WO detects the sensitive less-popular website, the base rate works in +reverse. The probability of an unpopular website being both miss-classified and +visited in the timeframe is small for all but the most popular websites. The key +question becomes one of belief in the base rate of the network and that of the +target user, as analysed in Appendix~\ref{cat:app:bayes}. + +Further, WO access improves both recall and precision for all monitored websites +against WTF-PAD. WTF-PAD only provides a three percentage points decrease in +recall compared to no defense for monitored websites with Alexa ranks 1k and +above. + +\subsection{CS-BuFLO and Tamaraw} +To evaluate the constant-rate defenses CS-BuFLO and Tamaraw by Cai et +al.~\cite{csbuflo,Tamaraw} we use Wang \emph{et~al.}'s dataset in the open world +\cite{Wang}. The dataset consists of 100 monitored websites with 90 instances +each and 9000 unmonitored sites (100x90+9k), that we randomly split (stratified) +into 8:1:1 for training, validation, and testing. We had to increase the length +of the input to DF for this dataset, from 5000 to 25000, to ensure that we +capture most of the dataset. To get defended traces for CS-BuFLO and Tamaraw we +use the slightly modified implementations as part of Cherubin's +framework~\cite{Cherubin17}. + +Figure~\ref{cat:fig:wang} shows the results of our simulations. DF alone is also +highly effective against the original Wang dataset---as expected---and our +attack simulation shows that we can further improve it with access to website +oracles. Most importantly, both CS-BuFLO and Tamaraw offer protection against DF +with and without oracle access by \emph{significantly lowering recall}. Tamaraw +offers an order of magnitude better defense in terms of recall. As implemented +in the framework by Cherubin, CS-BuFLO and Tamaraw reportedly has BOH 67.2\% and +256.7\%, and TOH 575.6\% and 341.4\%, respectively. This kind of overhead is +likely prohibitively large for real-world deployment in +Tor~\cite{csbuflo,Tamaraw,wtf-pad,DF,WT}. + +\begin{figure}[!t] + \centering + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/wang_nodef} + \caption{No defense.\\\,} + \label{cat:fig:wang:nodef} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/wang_csbuflo} + \caption{CS-BuFLO~\cite{csbuflo}, with reported 67.2\% BOW and 575.6\% TOH~\cite{Cherubin17}.} + \label{cat:fig:wang:csbuflo} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/wang_tamaraw} + \caption{Tamaraw~\cite{Tamaraw}, with reported 256.7\% BOW and 341.4\% TOH~\cite{Cherubin17}.} + \label{cat:fig:wang:tamaraw} + \end{subfigure} + \caption{Attack simulation for Deep Fingerprinting (DF)~\cite{DF} with website oracles (100 ms timeframe) on Wang \emph{et~al.}'s dataset~\cite{Wang}. The lines in each sub-figure show DF with and without website oracle access for different starting Alexa ranks for monitored websites.} + \label{cat:fig:wang} +\end{figure} + + +\subsection{DynaFlow} +DynaFlow is a \emph{dynamic} constant-rate defense by Lu +\emph{et~al.}~\cite{DynaFlow} with two configurations that result in different +overheads and levels of protection. Lu \emph{et~al.} gathered their own dataset +of 100 monitored websites with 90 instances each and 9000 unmonitored websites +(100x90+9k, same as Wang \emph{et~al.}'s \cite{Wang}) to be able to combine +smaller packets, as discussed briefly in Section~\ref{cat:sec:back:wf}. As for +CS-BuFLO and Tamaraw, we had to increase the length of the input to DF for this +dataset to 25000 to ensure that we capture most of the dataset. + +Figure~\ref{cat:fig:dynaflow} shows the results of our simulations for no defense as +well as the two configurations of DynaFlow. As for Wang \emph{et~al.}'s +dataset~\cite{Wang}, we see as expected that DF is highly effective and WO +access further improves the attack. Further, both configurations of DynaFlow are +effective defenses, comparable to CS-BuFLO with significantly lower overheads at +first glance. However, note that the comparison is problematic due to DynaFlow +combining smaller packets. The extra overhead for config 2 over 1 is not wasted: +recall is significantly reduced, more than halved for regular DF and slightly +less than half with a WO. + +\begin{figure}[!t] + \centering + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/dynaflow_nodef} + \caption{No defense.\\\,} + \label{cat:fig:dynaflow:nodef} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/dynaflow_config1} + \caption{DynaFlow~\cite{DynaFlow} config 1, with measured 59\% BOH and 24\% TOH.} + \label{cat:fig:dynaflow:config1} + \end{subfigure} + \begin{subfigure}{.495\columnwidth} + \includegraphics[width=1\textwidth]{src/cat/img/dynaflow_config2} + \caption{DynaFlow~\cite{DynaFlow} config 2, with measured 109\% BOH and 30\% TOH.} + \label{cat:fig:dynaflow:config2} + \end{subfigure} + \caption{Attack simulation for Deep Fingerprinting (DF)~\cite{DF} with website oracles (100 ms timeframe) on Lu \emph{et~al.}'s dataset~\cite{DynaFlow}. The lines in each sub-figure show DF with and without website oracle access for different starting Alexa ranks for monitored websites.} + \label{cat:fig:dynaflow} +\end{figure} -- cgit v1.2.3