From 98d7d5ede3be12c69baeced525abd29a92255e68 Mon Sep 17 00:00:00 2001
From: Riku-Laine <28960190+Riku-Laine@users.noreply.github.com>
Date: Fri, 2 Aug 2019 13:36:52 +0300
Subject: [PATCH] Report outline added

---
 paper/biblio.bib |  84 +++++++++-
 paper/macros.tex |   4 +
 paper/sl.tex     | 427 ++++++++++++++++++++++++++++++-----------------
 3 files changed, 351 insertions(+), 164 deletions(-)

diff --git a/paper/biblio.bib b/paper/biblio.bib
index 2f14a55..5ac1b8e 100755
--- a/paper/biblio.bib
+++ b/paper/biblio.bib
@@ -1,8 +1,78 @@
+%% This BibTeX bibliography file was created using BibDesk.
+%% http://bibdesk.sourceforge.net/
+
+%% Created for Laine, Riku P at 2019-08-02 09:38:23 +0300 
+
+
+%% Saved with string encoding Unicode (UTF-8) 
+
+
+
+@inproceedings{tolan2019why,
+	Acmid = {3326705},
+	Address = {New York, NY, USA},
+	Author = {Tolan, Song\"{u}l and Miron, Marius and G\'{o}mez, Emilia and Castillo, Carlos},
+	Booktitle = {Proceedings of the Seventeenth International Conference on Artificial Intelligence and Law},
+	Date-Added = {2019-08-02 06:37:23 +0000},
+	Date-Modified = {2019-08-02 06:38:23 +0000},
+	Doi = {10.1145/3322640.3326705},
+	Isbn = {978-1-4503-6754-7},
+	Keywords = {algorithmic bias, algorithmic fairness, criminal recidivism, machine learning, risk assessment},
+	Location = {Montreal, QC, Canada},
+	Numpages = {10},
+	Pages = {83--92},
+	Publisher = {ACM},
+	Series = {ICAIL '19},
+	Title = {Why Machine Learning May Lead to Unfairness: Evidence from Risk Assessment for Juvenile Justice in Catalonia},
+	Url = {http://doi.acm.org/10.1145/3322640.3326705},
+	Year = {2019},
+	Bdsk-Url-1 = {http://doi.acm.org/10.1145/3322640.3326705},
+	Bdsk-Url-2 = {http://dx.doi.org/10.1145/3322640.3326705}}
+
+@article{pearl2010introduction,
+	Author = {Pearl, Judea},
+	Date-Added = {2019-08-02 06:37:23 +0000},
+	Date-Modified = {2019-08-02 06:37:23 +0000},
+	Journal = {The international journal of biostatistics},
+	Number = {2},
+	Publisher = {De Gruyter},
+	Title = {An introduction to causal inference},
+	Volume = {6},
+	Year = {2010}}
+
 @inproceedings{lakkaraju2017selective,
-  title={The selective labels problem: Evaluating algorithmic predictions in the presence of unobservables},
-  author={Lakkaraju, Himabindu and Kleinberg, Jon and Leskovec, Jure and Ludwig, Jens and Mullainathan, Sendhil},
-  booktitle={Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
-  pages={275--284},
-  year={2017},
-  organization={ACM}
-}
+	Acmid = {3098066},
+	Address = {New York, NY, USA},
+	Author = {Lakkaraju, Himabindu and Kleinberg, Jon and Leskovec, Jure and Ludwig, Jens and Mullainathan, Sendhil},
+	Booktitle = {Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+	Date-Added = {2019-08-02 06:37:23 +0000},
+	Date-Modified = {2019-08-02 06:37:23 +0000},
+	Doi = {10.1145/3097983.3098066},
+	Isbn = {978-1-4503-4887-4},
+	Keywords = {evaluating machine learning algorithms, selective labels, unmeasured confounders, unobservables},
+	Location = {Halifax, NS, Canada},
+	Numpages = {10},
+	Pages = {275--284},
+	Publisher = {ACM},
+	Series = {KDD '17},
+	Title = {The Selective Labels Problem: Evaluating Algorithmic Predictions in the Presence of Unobservables},
+	Url = {http://doi.acm.org/10.1145/3097983.3098066},
+	Year = {2017},
+	Bdsk-Url-1 = {http://doi.acm.org/10.1145/3097983.3098066},
+	Bdsk-Url-2 = {http://dx.doi.org/10.1145/3097983.3098066}}
+
+@article{jung2018algorithmic,
+	Author = {Jung, Jongbin and Shroff, Ravi and Feller, Avi and Goel, Sharad},
+	Date-Added = {2019-08-02 06:37:23 +0000},
+	Date-Modified = {2019-08-02 06:37:23 +0000},
+	Journal = {arXiv preprint arXiv:1805.01868},
+	Title = {Algorithmic decision making in the presence of unmeasured confounding},
+	Year = {2018}}
+
+@article{dearteaga2018learning,
+	Author = {De-Arteaga, Maria and Dubrawski, Artur and Chouldechova, Alexandra},
+	Date-Added = {2019-08-02 06:37:23 +0000},
+	Date-Modified = {2019-08-02 06:37:23 +0000},
+	Journal = {arXiv preprint arXiv:1807.00905},
+	Title = {Learning under selective labels in the presence of expert consistency},
+	Year = {2018}}
diff --git a/paper/macros.tex b/paper/macros.tex
index 217f428..c2d7e76 100755
--- a/paper/macros.tex
+++ b/paper/macros.tex
@@ -45,6 +45,10 @@
 \newcommand{\outcome}{\ensuremath{Y}\xspace}
 \newcommand{\outcomeValue}{\ensuremath{y}\xspace}
 \newcommand{\doop}[1]{\ensuremath{\mathbf{do}(#1)}}
+\newcommand{\unobservable}{\ensuremath{Z}\xspace}
+\newcommand{\unobservableValue}{\ensuremath{z}\xspace}
+\newcommand{\invlogit}{\text{logit}^{-1}}
+
 
 \newcommand{\generalPerformance}{\ensuremath{\mathbf{gp}}\xspace}
 \newcommand{\empiricalPerformance}{\ensuremath{\mathbf{ep}}\xspace}
diff --git a/paper/sl.tex b/paper/sl.tex
index 502f1f0..dbb3637 100755
--- a/paper/sl.tex
+++ b/paper/sl.tex
@@ -53,7 +53,8 @@
 
 
 \begin{abstract}
-We show how a causality-based approach can be used to estimate the performance of prediction algorithms in `selective labels' settings -- with particular application to `bail-or-jail' judicial decisions.
+%We show how a causality-based approach can be used to estimate the performance of prediction algorithms in `selective labels' settings -- with particular application to `bail-or-jail' judicial decisions.
+.
 \end{abstract}
 
 
@@ -66,177 +67,289 @@ We show how a causality-based approach can be used to estimate the performance o
 \renewcommand{\shortauthors}{Authors}
 
 
-\section{Introduction}
+\section{Introduction} 
+
+\begin{itemize}
+\item What we study
+	\begin{itemize}
+		\item We study methods for evaluating the performance of predictive algorithms/models when the historical data suffer from selective labeling and unmeasured confounding.
+	\end{itemize}
+\item Motivation for the study
+	\begin{itemize}
+		\item Many decisions are being made that affect the course of human lives.
+		\item Using computational models could enhance the decision-making process in terms of accuracy and fairness.
+		\item The advantage of using models lies not only in raw throughput (a machine can make many decisions quickly) but also in that a machine can learn from a vast amount of information and, with care, can be made as unbiased as possible.
+		\item The explainability of black-box models has been discussed in X
+		\item Selective labeling is an issue in multiple fields where machine learning algorithms could be deployed. (Loans, medicine, justice, insurance, ...)
+		\item Before deployment, algorithms should be audited to show that they actually improve on human decision-making.
+		\item Auditing algorithms in conventional settings is straightforward: (almost) all of the labels are available, and numerous metrics have been proposed and are in use across multiple fields.
+	\end{itemize}
+\item Present the setting and challenge:
+	\begin{itemize}
+		\item `Selective labels' settings arise in situations where data are the product of a decision mechanism that prevents us from observing certain variables for part of the data.
+		\item A typical example is that of bail-or-jail decisions in judicial settings: a judge decides whether to grant bail to a defendant based on whether the defendant is considered likely to violate bail conditions while awaiting trial -- and therefore a violation might occur only in case bail is granted.
+		\item Such settings give rise to questions about the effect of alternative decision mechanisms  -- e.g., `how many defendants would violate bail conditions if more bail decisions were granted?'.
+		\item In other words, one faces the challenge to estimate the performance of an alternative, potentially automated, decision policy that might make different decisions than the one found in the judicial data.
+		\item Labels are missing non-randomly, and the decisions may have been made by multiple deciders who differ in leniency.
+		\item (Note: our approach doesn't require multiple deciders)
+		\item In settings like judicial bail decisions, some outcomes cannot be observed due to the nature of the decisions. This results in a complicated missing data problem in which the missingness of an item's label is connected with its outcome and the available labels are a non-random sample of the underlying population. Recently this problem has been named the selective labels problem \cite{lakkaraju2017selective}.
+	\end{itemize}
+\item Related work
+	\begin{itemize}
+		\item Lakkaraju et al. presented contraction, which performed well compared to methods previously presented in the literature \cite{lakkaraju2017selective}. We benchmark our approach against it and show that we improve on their algorithm both in terms of restrictions and in terms of accuracy.
+		\item Jung et al. presented a method for constructing optimal policies \cite{jung2018algorithmic}; we show that their approach can be applied to the selective labels setting.
+		\item Their setting featured neither selective labeling nor judges who differ in leniency.
+		\item Selection bias has been extensively discussed in the causal inference literature (Pearl, Bareinboim etc.)
+	\end{itemize}
+\item Our contribution
+	\begin{itemize}
+	\item In this paper we propose a (novel, modular) framework for presenting these missing data problems by breaking the process into separate modules and explicating their function.
+	\item In addition, we present an approach for inferring the missing labels in order to evaluate the performance of predictive models in settings where selective labeling and latent confounding are present. We use a flexible Bayesian approach to estimate the failure rate of a given model.
+	\item We show that our method is robust to violations of, and modifications to, the data generating mechanism.
+	\end{itemize}
+\end{itemize}
 
-`Selective labels' settings arise in situations where data are the product of a decision mechanism that prevents us from observing certain variables for part of the data.
-A typical example is that of bail-or-jail decisions in judicial settings: a judge decides whether to grant bail to a defendant based on whether the defendant is considered likely to violate bail conditions while awaiting trial -- and therefore a violation might occur only in case bail is granted.
-Such settings give rise to  questions about the effect of alternative decision mechanisms  -- e.g., `how many defendants would violate bail conditions if more bail decisions were granted?'.
-In other words, one faces the challenge to estimate the performance of an alternative, potentially automated, decision policy that might make different decisions than the one found in the judicial data.
 
-The challenge was addressed by Lakkaraju et.al. in \cite{lakkaraju2017selective}, in a setting that involved multiple judges of varying leniency, and under the assumption that defendants are assigned to judges randomly. Lakkaraju et.al. estimate the performance of an automated decision-making algorithm (`algorithm', for short) via a technique they call `contraction' - it proceeds as follows:
+\section{Framework}
+
 \begin{itemize}
-	\item It considers a set of judges with same number $N$ of judged defendants each.
-	\item Judges are ordered from most lenient (most bail decisions) to least lenient. 
-		Let $n_i$ be the number of bail decisions for judge $\#i$. We have $n_{i+1} \leq n_i$.
-	\item The algorithm considers the $n_i$ defendants that were granted bail by the $i$-th judge.
-	\item It keeps the $n_{i+1} \leq n_i$ defendants that it finds most likely to violate the bail.
-	\item It makes its own bail-or-jail decision for each of those $n_{i+1}$ defendants.
-	\item Its performance is measured as the number of defendants that it decides to bail but who, according to the data, eventually violated the bail.
-	\item Its performance is compared to the performance of judge $\#(i+1)$, based on the cases they bailed.
+\item Definitions \\
+	In this paper we apply our approach to binary outcomes, but it is readily modifiable to accommodate continuous or categorical responses; in that case an appropriate metric, e.g.\ the sum of squared errors, could be used as the measure of performance.
+	\begin{itemize}
+	\item Failure rate
+		\begin{itemize}
+		\item Failure rate (FR) is defined as the ratio of undesired outcomes to all decisions made (a formal definition is given after this list). A special characteristic of FR in this setting is that a failure can only occur after a positive decision: we only observe the outcome when the corresponding decision is positive.
+		\item A failure rate of zero can therefore be achieved trivially by never giving a positive decision, which is clearly not the goal.
+		\end{itemize}
+	\item Acceptance rate
+		\begin{itemize}
+		\item Acceptance rate (AR) is defined as the ratio of positive decisions to all decisions given by a decision-maker.
+		\item In some settings (justice, medicine), one might want to know what failure rate results if X\% of the subjects are accepted, or what the highest acceptance rate is that keeps the failure rate at an acceptable level.
+		\item We want to characterize the trade-off between acceptance rate and failure rate.
+		\item Lakkaraju et al. noted that, in the data, judges with higher leniency have labeled a larger portion of the data, which may result in bias.
+		\end{itemize}
+	\item By decider or decision-maker we refer to a judge, a doctor, \ldots\ who makes the decisions that determine which labels are available. Some deciders may have an incentive for positive decisions if these imply savings: a judge saves resources by not jailing a defendant, a doctor by not assigning a patient to higher-intensity care. (Move to motivation?)
+	\item By unobservables we refer to latent, unrecorded information regarding the outcome that is available only to the decision-maker.
+\end{itemize}
+\item Modules \\
+	We separate the steps that modify the data into distinct modules in order to formally define their inner workings. Modules have different functions, inputs and outputs, and a module is interchangeable with another module of the same type (e.g., a decider module of type A can be swapped for a decider module of type B).
+	\begin{itemize}
+	\item Decider modules
+		\begin{itemize}
+		\item In general, the decider module assigns predictions to the observations based on some information.
+		\item The information available to a decider in the decider module includes observable and -- possibly -- unobservable features, denoted with X and Z respectively.
+		\item The predictions given by a decider module can be relative or absolute. By relative predictions we mean that a decider module can output a ranking or an ordering of the subjects based on their predicted tendency towards an outcome. Absolute predictions can be binary or continuous in nature, corresponding to yes-or-no decisions or to probability values.
+		\item The inner workings of a module may or may not be known. In observational data sets, the mechanism or decider that labeled the data is usually unknown.
+		\item The decider (module) in the data step has unobservable information available when making its decisions.
+		\item The behaviour of the decider module in the data step can be defined in many ways. We use both the method presented by Lakkaraju et al. and two methods of our own, which we created to remove the interdependencies between the decisions made by the decider of Lakkaraju et al.
+		\item The difference between the deciders in the data and modelling steps is that we usually cannot observe all the information that was available to the decider in the data step, nor its full decision-making process, whereas both are known for the decider in the modelling step.
+		\end{itemize}
+	\item Evaluator modules
+		\begin{itemize}
+		\item The evaluator module receives the decisions, the observable features of the subjects, and the predictions made by the deciders in the data and modelling steps.
+		\item The evaluator module outputs an estimate of a decider module's performance. The estimate should be unbiased with low variance, it should be robust to slight changes in the data generation, and it should be accurate at all levels of decider leniency.
+		\end{itemize}
+	\end{itemize}
+%\item Example: in observational data sets, the deciders have already made decision concerning the subjects and we have a selectively labeled data set available. In the modular framework we refer to the actions of the human labelers as a decider module which has access to latent information. 
+\item Problem formulation \\
+Given the selective labeling of the data and the latent confounders present, our goal is to create an evaluator module that outputs a reliable estimate of a given decider module's performance. We use acceptance rate and failure rate as the measures against which we compare our evaluators, because they have direct and easily understandable counterparts in the applicable domains. The evaluator module should be able to estimate the failure rate accurately at all levels of leniency and on all data sets.
+
+The ``eventual goal'' is to construct a decider module that outperforms (has a lower failure rate at every level of acceptance rate) the deciders in the data generating process. That, of course, requires comparing the performance of deciders, which is the problem we address here.
+
+(It's important to somehow keep these two different goals separate.)
 \end{itemize}
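+
+To make the above measures concrete, one possible formalization (for $n$ subjects with observed decisions $t_i$ and outcomes $y_i$, where $1$ denotes a positive decision or outcome) is
+\begin{equation}
+	\text{AR} = \frac{1}{n} \sum_{i=1}^{n} \indicator{t_i = 1},
+	\qquad
+	\text{FR} = \frac{1}{n} \sum_{i=1}^{n} \indicator{t_i = 1 \wedge y_i = 0}.
+\end{equation}
+Since a failure can only be observed when $t_i = 1$, estimating FR for any decider other than the one that produced the data is precisely what the selective labels setting makes difficult.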
-The above procedure gives us a comparison between the performance of the algorithm to that of judges at the $n_{i+1}/N$ leniency level (leniency measured as the rate of bail decisions).
-A major drawback of the {\it contraction} technique is that it requires data to include judges at a given leniency level.
-
-In this document, we describe a different approach based on causal analysis, that allows us to estimate the performance of a decision-making system at any leniency level.
-
-\section{Setting}
-
-Consider a judge who decides whether to grant bail to a defendant based on whether the defendant is considered likely to violate bail conditions while awaiting trial.
-We use variable \decision to store the outcome of the bail-or-jail decision, with $\decision = 1$ denoting a bail decision and $\decision = 0$ a jail decision.
-Whether the defendant violates the bail conditions depends on the bail-or-jail decision \decision and the features \features of the defendant.
-
-The decision is based on the following variables. First, the features \features of the defendant, which we assume to be observed.
-Secondly, the leniency of the judge, expressed as a variable \leniency.
-Specifically, we assume that every judge evaluates a given candidate according to the probability 
-\[
-\prob{\outcome = 0 | \features = \featuresValue, \doop{\decision = 1}} 
-\]
-that the candidate will violate bail conditions (\outcome = 0) if they were granted bail.
-We write \outcome = 0 to refer to the case when the defendant does not violate bail, whether bail is granted or not.
-The \doop{condition} expression signifies that, in evaluating the probability, we consider the event where the condition  (here, it is the condition $\decision = 1$) is imposed to the data-generation process (and therefore alters the generative model).
-In addition, we assume that every judge would assign the same value to the above probability, given by a function \score{\featuresValue}.
-\[
-\score{\featuresValue} = \prob{\outcome = 0 | \features = \featuresValue, \doop{\decision = 1}}
-\]
-The assumption that, essentially, all judges have the same model for the probability that a defendant would violate bail is not far-fetched for the purposes of our analysis, particularly taking into account that \score{\featuresValue} can be learned from the observed data
-\[
-\prob{\outcome = 0 | \features = \featuresValue, \doop{\decision = 1}} = \prob{\outcome = 0 | \features = \featuresValue, \decision = 1}
-\]
-and that data are publicly accessible, allowing us to assume that all judges have access to the same information.
-Where judges {\it do differ} is at the level of their leniency \leniency.
-Following the above assumptions, a judge with leniency \leniency = \leniencyValue grants bail to the defendants for which $F(\featuresValue) < r$, where $F$ is the cumulative distribution.
-\begin{equation}
-	F(\featuresValue_0) = \int { \indicator{\prob{\outcome = 0| \decision = 1, \features = \featuresValue} > \prob{\outcome = 0| \decision = 1, \features = \featuresValue_0}}	d\prob{\featuresValue}	} 
-\end{equation}
-
-which can be written as
-
-\begin{equation}
-	F(\featuresValue_0) = \int {\prob{\featuresValue}  \indicator{\prob{\outcome = 0| \decision = 1, \features = \featuresValue} > \prob{\outcome = 0| \decision = 1, \features = \featuresValue_0}} d\featuresValue}
-\end{equation}
-
-\note[RL]{
-	Should the inequality be reversed? With some derivations
-	\begin{equation}
-	F(\featuresValue_0) = \int {\prob{\featuresValue}  \indicator{\score{\featuresValue} < \score{\featuresValue_0} } ~ d\featuresValue}
-\end{equation}
-}
 
+\section{Counterfactual-Based Imputation For Selective Labels}
 
-The bail-or-jail scenario is just one example of settings that involve a decision $\decision \in\{0,1\}$ that is based on individual features \features and leniency (acceptance rate) \leniency -- and where a behavior of interest \outcome is observed only for the cases where \decision = 1.
-The diagram of the causal model is shown in Figure~\ref{fig:causalmodel}.
-Our results are applicable to other scenarios with same causal model.
-
-\begin{figure}
-\begin{center}
-\includegraphics[width=\columnwidth]{img/causalmodel.png}
-\end{center}
-\caption{Causal model.}
-\label{fig:causalmodel}
-\end{figure}
-
-\subsection{Analysis Task}
-
-We will use existing machine-learning techniques from the literature to learn function \score{\featuresValue}, with the goal to build a decision system that outperforms judges.
-The challenge we face is to estimate accurately the performance of the decision system -- given that we are in a `selective labels' setting.
-Performance is measured {\it for a given leniency level} as the rate at which bail is granted {\it and} the defendant violates it.
-In other words, performance is measured as the probability that a decision lead to undesired outcome.
-
-\section{Analysis}
-
-We wish to calculate the probability of undesired outcome (\outcome = 0) at a fixed leniency level.
-\begin{align*}
-& \prob{\outcome = 0 | \doop{\leniency = \leniencyValue}} = \nonumber \\
-& = \sum_\decisionValue \prob{\outcome = 0, \decision = \decisionValue | \doop{\leniency = \leniencyValue}} \nonumber \\
-& = \prob{\outcome = 0, \decision = 0 | \doop{\leniency = \leniencyValue}} + \prob{\outcome = 0, \decision = 1 | \doop{\leniency = \leniencyValue}} \nonumber \\
-& = 0 + \prob{\outcome = 0, \decision = 1 | \doop{\leniency = \leniencyValue}} \nonumber \\
-& = \prob{\outcome = 0, \decision = 1 | \doop{\leniency = \leniencyValue}} \nonumber \\
-& = \sum_\featuresValue \prob{\outcome = 0, \decision = 1, \features = \featuresValue | \doop{\leniency = \leniencyValue}} \nonumber \\
-& = \sum_\featuresValue \prob{\outcome = 0, \decision = 1 | \doop{\leniency = \leniencyValue}, \features = \featuresValue} \prob{\features = \featuresValue | \doop{\leniency = \leniencyValue}} \nonumber \\
-& = \sum_\featuresValue \prob{\outcome = 0, \decision = 1 | \doop{\leniency = \leniencyValue}, \features = \featuresValue} \prob{\features = \featuresValue} \nonumber \\
-& = \sum_\featuresValue \prob{\outcome = 0 | \decision = 1, \doop{\leniency = \leniencyValue}, \features = \featuresValue} \prob{\decision = 1 | \doop{\leniency = \leniencyValue}, \features = \featuresValue} \prob{\features = \featuresValue} \nonumber \\
-& = \sum_\featuresValue \prob{\outcome = 0 | \decision = 1, \features = \featuresValue} \prob{\decision = 1 | \leniency = \leniencyValue, \features = \featuresValue} \prob{\features = \featuresValue}
-\end{align*}
-
-\antti{Here one can drop do even at the first line according to do-calculus rule 2, i.e. $P(Y=0|do(R=r))=P(Y=0|R=r)$. However, do-calculus formulas should be computed by first learning a graphical model and then computing the marginals using the graphical model. This gives more accurate result. Michael's complicated formula essentially does this, including forcing $P(Y=0|T=0,X)=0$ (the model supports context-specific independence $Y \perp X | T=0$.)}
-
-Expanding the above derivation for model \score{\featuresValue} learned from the data
-\[
-\score{\featuresValue} = \prob{\outcome = 0 | \features = \featuresValue, \decision = 1},
-\]
-the {\it generalized performance} \generalPerformance of that model is given by the following formula.
-\begin{equation}
-\generalPerformance = \sum_\featuresValue \score{\featuresValue} \indicator{F(\featuresValue) < r} \prob{\features = \featuresValue}
-\label{eqn:gp}	
-\end{equation}
-Equation~\ref{eqn:gp} can be calculated for a given model \datadistr{\featuresValue} = \prob{\features = \featuresValue} of individual features.
-Alternatively, we can have an empirical measure \empiricalPerformance of performance over the $\datasize$ data points in dataset \dataset, given by the following equation.
-\begin{equation}
-\empiricalPerformance = \frac{1}{\datasize} \sum_{(\featuresValue, \outcomeValue)\in\dataset}  \score{\featuresValue} \indicator{F(\featuresValue) < r} 
-\label{eqn:gp}	
-\end{equation}
-
-\subsection{Comments}
-Roughly speaking, the above formulas should work well if `bail' cases (\decision = 1) cover well the area spanned by the observed features of defendants -- i.e., we do not have large areas of \features with no or too few bail cases.
-
-If there are such areas, then we cannot do much about the lack of data. 
-One reasonable modeling choice, however, is to impose the following priors on \score{\featuresValue}: 
-\begin{enumerate}
-	\item $\score{\featuresValue} \approx 1$ for areas near values of \features for which we have observed data but few bail decisions (i.e., we assume a-priori that a defendant is more likely to violate bail -- a belief that will change if the data tell us otherwise);
-	\item $\score{\featuresValue} \approx 0$ for areas near unobserved values of \features (i.e., we assume that people who are unlikely to ever be taken to court would probably `play nice' and not violate bail).
-\end{enumerate}
-
-Lack of data for large areas of \features is a potential problem for the {\it contraction} technique of Lakkaraju et.al., as well.
-Unlike contraction, though, our approach does not require to have data at all leniency levels.
-Moreover, it is easy to see based on the derivations of Eq.\ref{eqn:gp} that our approach would work identically in the case where defendants are not assigned to judges at random (i.e., if there was a causal relation $\features\rightarrow\leniency$).
-
-\section{Results}
-
-Below we present our results in various settings. Models are evaluated in contrast to the following quantities:
 \begin{itemize}
-\item {\it True evaluation:} Depicts the true performance of the predictive model. Constructed by sorting all the labels in the test data (even the ones hidden from the models) by the predicted probabilities and then simulating the acceptance rate at the given level. (Note: True evaluation can only be evaluated on synthetic data sets.)
-\item {\it Labeled outcomes:} Similar to {\it true evaluation} but only available labels with positive decisions $(\decision = 1)$ are used.
-\item {\it Human evaluation:} Human decision makers with similar acceptance rates are grouped and treated as a single decision maker.
-\item {\it Contraction:} Contraction curve was constructed as explained by Lakkaraju et al. \cite{lakkaraju2017selective}.
-\item {\it Causal model, ep:} Curve presents the predicted probability $\prob{\outcome = 0 | \doop{\leniency = \leniencyValue}}$ at various levels of acceptance rate.
+\item Theory \\ (Present here (1) what counterfactuals are, (2) motivation for structural equations, (3) an example or other more easily approachable explanation of applying them, (4) why we used computational methods)
+	\begin{itemize}
+	\item Counterfactuals are 
+		\begin{itemize}
+		\item hypothetical quantities that encode the would-have-been relation between the outcome and the treatment assignment.
+		\item Using counterfactuals, we can discuss hypothetical events that did not happen.
+		\item Using counterfactuals requires defining a structural causal model.
+		\item Pearl's \emph{Book of Why}: ``the fundamental problem''.
+		\end{itemize}
+	\item By defining structural equations / a graph
+		\begin{itemize}
+		\item we can begin formulating causal queries and answering them.
+		\item Once we have defined the equations, counterfactuals are obtained by \ldots
+		\item We denote the counterfactual ``\outcome would have been \outcomeValue had \decision been \decisionValue'' with \ldots
+		\item By first estimating the distribution of the latent variable \unobservable we can impose \ldots
+		\item Now counterfactuals can be defined as
+			\begin{definition}[Unit-level counterfactuals \cite{pearl2010introduction}]
+			Let $M$ be a structural model and $M_x$ a modified version of $M$, with the equation(s) of $X$ replaced by $X = x$. Denote the solution for $Y$ in the equations of $M_x$ by the symbol $Y_{M_x}(u)$. The counterfactual $Y_x(u)$ (read: ``the value of $Y$ in unit $u$, had $X$ been $x$'') is given by:
+			\begin{equation} \label{eq:counterfactual}
+				Y_x(u) := Y_{M_x}(u)
+			\end{equation}
+			\end{definition}
+		\end{itemize}
+	\item At a high level
+		\begin{itemize}
+		\item some information about the unobservables can usually be recovered from the data. For example, if the observable attributes are in contrast with the decision or the outcome, we can infer that the latent variable carried significant information.
+		\item We retrieve this information using the prespecified structural equations. After estimating the desired parameters, we can estimate the value of the counterfactual (unobserved) outcome by switching the value of the intervened variable (here, the decision) and carrying the computation through the rest of the graph \ldots
+		\end{itemize}
+	\item Recent advances in computational methods provide us with ways of inferring the value of the latent variable by applying Bayesian techniques to \ldots\ Previously this kind of analysis required us to define X and compute Y \ldots
 \end{itemize}
+\item Model (structure, equations at a general and at a more specific level, assumptions, how we construct the counterfactual \ldots)
+	\begin{itemize}
+	\item The structure is as shown in the diagram; the box around Z indicates that it is unobservable/latent.
+	The features of the subjects include observable and (possibly) unobservable features, denoted by X and Z respectively. The only feature of a decider is its leniency R (depicting some baseline probability of a positive decision). The decisions given are denoted by T and the resulting outcomes by Y, where 0 stands for a negative outcome or decision and 1 for a positive one.
+	\item The causal diagram presents how decision T is affected by the decider's leniency (R), the subject's observable private features (X) and the latent information regarding the subject's tendency for a negative outcome (Z). Correspondingly the outcome (Y) is affected only by the decision T and the above-mentioned features X and Z. 
+	\item The causal directions and implied independencies are readable from the diagram. We assume X and Z to be independent.
+	\item The structural equations connecting the variables can be formalized at a general level as (see Jung)
+		\begin{align}
+		\outcome(0) & = 1 \quad \text{(or NA?)} \nonumber \\
+		\outcome(1) & \sim f(\featuresValue, \unobservableValue; \beta_{\featuresValue\outcomeValue}, \beta_{\unobservableValue\outcomeValue}) \nonumber \\
+		\decision      & \sim g(\featuresValue, \unobservableValue; \beta_{\featuresValue\decisionValue}, \beta_{\unobservableValue\decisionValue}, \alpha_j) \nonumber \\
+		\outcome & = \outcome(\decisionValue) \label{eq:structural_equations}
+		\end{align}
+	where the $\alpha$ and $\beta$ coefficients are the path coefficients specified in the causal diagram.
+	\item This general formulation of the selective labels problem enables the use of our approach even when the outcome is not binary. Notably, compared to the approach of Jung et al., it makes the selective labels issue explicit in the structural equations by deterministically setting the outcome y to one in the event of a negative decision. In addition, we allow the judges to differ in their baseline probabilities of a positive decision, which by definition is their leniency.
+	\item Now, by imposing a value for the decision \decision we can obtain the counterfactual simply by assigning the desired value in equation~\ref{eq:structural_equations}. This assumes that... (consistency constraint.) We want to know {\it what the outcome \outcome would have been for an individual with features \featuresValue had the decision been $\decision = 1$, or more specifically $\outcome_{\decision = 1}(\featuresValue)$} (a sketch of the resulting estimator is given after this list).
+	\item To compute the value of the counterfactuals, we need to obtain estimates of the coefficients and latent variables. We specify a Bayesian (structural) model, which requires establishing a set of probabilistic expressions connecting the observed quantities to the parameters of interest. The relationships between the variables and coefficients are presented at a general level in equation \ref{eq:structural_equations} and figure X. We model the observed data as
+		\begin{align}
+		 y_i(1) & \sim \text{Bin}(1, \invlogit(\beta_{xy,k[i]} x_i + \beta_{zy,k[i]} z_i)) \nonumber \\
+		 t_i & \sim \text{Bin}(1, \invlogit(\alpha_{j[i]} + \beta_{xt,k[i]} x_i + \beta_{zt,k[i]} z_i)). \label{eq:data_model}
+		\end{align}
+	\item Bayesian models also require the specification of prior distributions for the parameters of interest in order to obtain their distribution after the observations, i.e.\ the posterior distribution.
+	\item Identifiability of models with unobserved confounding has been discussed by, e.g., McCandless et al. and Gelman. Following Gelman, we note that scale-invariance is tackled by specifying the priors. (?)
+	\item Specify, motivate and explain priors here if space.
+	\end{itemize}
+\item Computation (Stan in general, ...)
+	\begin{itemize}
+	\item Using the model specified in equation X, we used Stan to estimate the intercepts, path coefficients and latent variables (a minimal computational sketch is given after this list). Stan provides tools for efficiently computing estimates of posterior distributions. It uses the No-U-Turn Sampler (NUTS), an extension of the Hamiltonian Monte Carlo (HMC) algorithm, to estimate the posterior distribution for inference. (At a high level, the sampler uses the gradient of the posterior to compute the potential and kinetic energy of a particle moving on the multi-dimensional surface of the posterior, and draws samples from it.) Stan also provides implementations of black-box variational inference and of direct optimization of the posterior distribution, but these were deemed insufficient for estimating the posterior in this setting.
+	\item Chain lengths were set to X and the number of chains deployed was Y. (Explain the algorithm fully later.)
+	\end{itemize}
+\end{itemize}
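+
+As a sketch of how the imputed counterfactuals could enter the evaluation (one possible formulation, written here for reference; the final estimator may differ): let $\hat{t}_i \in \{0,1\}$ denote the decision of the evaluated decider module for subject $i$ of the $n$ subjects under evaluation, at the acceptance rate of interest. For the subjects it releases, we use the observed outcome whenever the data contains it ($t_i = 1$) and the posterior predictive probability of the counterfactual outcome when it does not ($t_i = 0$):
+\begin{equation}
+	\widehat{\text{FR}} = \frac{1}{n} \sum_{i=1}^{n} \hat{t}_i \Big( \indicator{t_i = 1} \indicator{y_i = 0} + \indicator{t_i = 0} \, \prob{\outcome_{\decision = 1} = 0 | x_i, \text{data}} \Big),
+\end{equation}
+where the posterior predictive probability is computed from the model in equation \ref{eq:data_model}.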
+
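+A minimal computational sketch of the workflow described above, assuming a Python interface to Stan (PyStan 2); the file \texttt{model.stan}, the data dictionary and the variable names (e.g.\ \texttt{y1}) are placeholders rather than the actual implementation:
+\begin{verbatim}
+import pystan  # assumed interface; settings below are placeholders
+
+# Observed quantities for the test data: decisions t, outcomes y (observed
+# only when t == 1), features x, and judge/group indices.
+stan_data = dict(N=len(x), x=x, t=t, y=y, jj=judge_idx, kk=group_idx)
+
+model = pystan.StanModel(file="model.stan")   # hypothetical file with the data model
+fit = model.sampling(data=stan_data, chains=4, iter=4000)  # NUTS by default
+
+draws = fit.extract()              # posterior draws of parameters and latents
+y1_hat = draws["y1"].mean(axis=0)  # posterior expectation of counterfactual Y(1)
+\end{verbatim}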
+\section{Extension To Non-Linearity (2nd priority)}
+
+\section{Related work}
 
-\subsection{Without unobservables}
+\begin{itemize}
+\item Lakkaraju and contraction. \cite{lakkaraju2017selective}
+\item Counterfactuals/Potential outcomes. \cite{pearl2010introduction} (also Rubin)
+\item The approach of Jung et al. for optimal policy construction. \cite{jung2018algorithmic}
+\item Discussions of latent confounders in multiple contexts.
+\item Imputation methods and other approaches to selective labels, e.g., \cite{dearteaga2018learning}.
+\end{itemize}
 
-The causal model for this scenario corresponds to that depicted in Figure \ref{fig:causalmodel}.
-For the analysis, we assigned 500 subjects to each of the 100 judges randomly.
-Every judge's leniency rate $\leniency$ was sampled uniformly from a half-open interval $[0.1; 0.9)$. 
-Private features $\features$ were defined as i.i.d standard Gaussian random variables.
-Next, probabilities for negative results $\outcome = 0$ were calculated as
-\[
-\prob{\outcome = 0| \features = \featuresValue} = \dfrac{1}{1+\exp\{-\featuresValue\}}.
-\]
-and then the result variable $\outcome$ was sampled from Bernoulli distribution with parameter $1-\frac{1}{1+\exp\{-\featuresValue\}}$.
+\section{Experiments}
 
-The decision variable $\decision$ was set to 0 if the probability $\prob{\outcome = 0| \features = \featuresValue}$ resided in the top $(1-\leniencyValue)\cdot 100 \%$ of the subjects appointed for that judge.
-Results for estimating the causal quantity $\prob{\outcome = 0 | \doop{\leniency = \leniencyValue}}$ with various levels of leniency $\leniencyValue$ under this model are presented in Figure \ref{fig:without_unobservables}.
+In this section we present our results from experiments with synthetic and realistic data. We show that our approach provides the best estimates for evaluating the performance of a predictive model at all levels of leniency.
 
-\begin{figure}
-\begin{center}
-\includegraphics[width=\columnwidth]{img/without_unobservables.png}
-\end{center}
-\caption{$\prob{\outcome = 0 | \doop{\leniency = \leniencyValue}}$ with varying levels of acceptance rate without unobservables. Error bars denote standard error of the mean.}
-\label{fig:without_unobservables}
-\end{figure}
+\subsection{Synthetic data}
 
 (RL: I presume MM's preference was that the outcome be drawn from a Bernoulli distribution and that the decisions be independent. So, let's first explain that approach thoroughly and then mention what we changed, as discussed.)
 
-% \textbf{Acknowledgments.}
+\begin{itemize}
+\item Data generation
+	\begin{itemize}
+	\item We experimented with synthetic data sets to show that our method is accurate, unbiased and has low variance.
+	\item We imitated the data generation process presented by Lakkaraju et al. so that the benchmarking against their method is meaningful.
+	\item We created data by sampling N=50k observations from three independent standard Gaussians.
+	\item The observations were assigned to variables X, Z, and W.
+	\item We then drew the outcome Y from a Bernoulli distribution with parameter $p = 1 - \invlogit(\ldots)$ (see the sketch at the end of this subsection).
+	\item This is one data generation module. It can be, and was, modified by changing the outcome-producing mechanism.
+	\item Next, the decisions were assigned by computing the quantile each subject belongs to. The quantile was obtained as the inverse cdf of \ldots
+	\item This way the observations are independent and the leniency is still a good estimate of the acceptance rate (the acceptance rate converges stochastically to the leniency).
+	\item This is a decider module. We experimented with different combinations of decider and data generating modules to show X / see Y (to see that our method is robust against non-informative, biased and otherwise bad decisions; due to space constraints we defer these results \ldots).
+	\end{itemize}
+\item Algorithms \\
+	We deployed multiple evaluator modules to estimate the true failure rate of the decider module. The estimates should be close to those of the true evaluation evaluator module, and they are eventually compared to the human evaluation curve.
+	\begin{itemize}
+	\item True evaluation
+		\begin{itemize}
+		\item Depicts the true performance of the model: ``how well would this model perform had it been deployed?'' Not available for real data. Calculated by ordering the observations by the predictions of the black-box model B and counting the failure rate from the ground-truth labels.
+		\end{itemize}
+	\item Human evaluation
+		\begin{itemize}
+		\item The performance of the deciders in the data generation step. We binned deciders with similar values of leniency and counted their failure rate.
+		\item In observational data sets, we can only record the decisions and acceptance rates of these decision-makers. 
+		\item This curve is eventually the benchmark for the performance of a model.
+		\end{itemize}
+	\item Labeled outcomes
+		\begin{itemize}
+		\item Vanilla estimator of a model's performance. Obtained by first ordering the observations by the predictions assigned by the decider in the modelling step.
+		\item Then the $(1-r)\cdot 100\%$ of the subjects considered most dangerous are detained and given a negative decision. The failure rate is computed as the ratio of negative outcomes to the number of subjects.
+		\end{itemize}
+	\item Contraction
+		\begin{itemize}
+		\item Algorithm by Lakkaraju et al. Depends on the random assignment of subjects to judges and requires that the judges differ in leniency.
+		\item Can estimate the true failure rate only up to acceptance rate q, the leniency of the most lenient decision-maker.
+		\item Performance is affected by the number of subjects judged by the most lenient decision-maker, the agreement rate, and the leniency of the most lenient decision-maker.
+		\item Works only for binary outcomes.
+		\item (We show that our method is not constrained by any of these.)
+		\item The algorithm goes as follows \ldots\ (a sketch of our reading of it is given at the end of this subsection).
+		\end{itemize}
+	\item Potential outcomes / CBI
+		\begin{itemize}
+		\item Take the test set.
+		\item Estimate the parameters from equations XX.
+		\item Using the posterior predictive distribution, obtain a point estimate for the failure rate.
+		\item Estimates of the counterfactuals Y(1) for the unobserved values of Y were obtained using the posterior expectations from Stan. We used the NUTS sampler to estimate the posterior. When the values for \ldots
+		\end{itemize}
+	\end{itemize}
+\item Results 
+(Target for this section, from the problem formulation: show that our evaluator is unbiased and accurate (report the mean absolute error), robust to changes in the data generation (a table perhaps; at least discuss situations where the decisions are bad/biased/random, i.e.\ non-informative or misleading), and show what happens if the decider in the modelling step is bad and its information is used as input.)
+	\begin{itemize}
+	\item Accuracy: we have defined two metrics, acceptance rate and failure rate. In this section we show that our method can accurately recover the true failure rate at all acceptance rates with a low mean absolute error. As figure X shows, our method recovers the true performance of the predictive model with good accuracy. The mean absolute errors w.r.t.\ the true evaluation were 0.XXX and 0.XXX for contraction and the CBI approach, respectively.
+	\item In figure X we also show how our method tracks the true evaluation curve with a low variance.
+	\end{itemize}
+\end{itemize}
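+
+For reference, a minimal sketch of the data generating and decider modules described above. The linear predictor $x + z + w$ for the outcome and the risk score $x + z$ used by the decider are assumptions made for this sketch only (they stand in for the elided expressions above):
+\begin{verbatim}
+import numpy as np
+from scipy.stats import norm
+
+rng = np.random.default_rng(0)
+N, N_JUDGES = 50_000, 100                # illustrative sizes
+
+# Data generation module: three independent standard Gaussians.
+x, z, w = rng.standard_normal((3, N))
+
+def inv_logit(a):
+    return 1.0 / (1.0 + np.exp(-a))
+
+# Outcome module: Y ~ Bernoulli(1 - logit^{-1}(.)); the linear predictor
+# x + z + w is an assumption for this sketch.
+y = rng.binomial(1, 1.0 - inv_logit(x + z + w))
+
+# Decider module: subjects are assigned to judges at random; a judge with
+# leniency r releases the subjects whose risk quantile is below r.  Taking
+# the risk score to be x + z ~ N(0, 2), its theoretical quantile is
+# Phi((x + z) / sqrt(2)), so decisions are independent across subjects and
+# the acceptance rate converges stochastically to the leniency.
+judge = rng.integers(0, N_JUDGES, size=N)
+leniency = rng.uniform(0.1, 0.9, size=N_JUDGES)[judge]
+quantile = norm.cdf(x + z, scale=np.sqrt(2))
+t = (quantile <= leniency).astype(int)   # 1 = positive decision (release)
+
+# Selective labeling: the outcome is recorded only when t == 1.
+y_observed = np.where(t == 1, y, np.nan)
+\end{verbatim}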
+
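+For comparison, a sketch of contraction as we read it from \cite{lakkaraju2017selective}; this is our paraphrase for illustration only and should be checked against the full restatement of the algorithm planned above:
+\begin{verbatim}
+import pandas as pd
+
+def contraction(d_q: pd.DataFrame, r: float) -> float:
+    """Estimate the failure rate of the black-box model at acceptance rate r,
+    using only d_q, the caseload of the most lenient judge q.  Columns:
+    't' (decision), 'y' (outcome, valid only when t == 1) and 'risk'
+    (the model's predicted failure probability).  Requires r <= leniency of q.
+    """
+    released = d_q[d_q["t"] == 1]          # labeled cases released by judge q
+    n_keep = int(r * len(d_q))             # number of releases at target rate r
+    # "Contract" the released set down to rate r by jailing its riskiest cases.
+    kept = released.sort_values("risk").head(n_keep)
+    # Observed failures among the kept cases, relative to the whole caseload.
+    return float((kept["y"] == 0).sum() / len(d_q))
+\end{verbatim}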
+\subsection{Realistic data}
+In this section we present results from experiments with (realistic) data sets. 
+
+\begin{itemize}
+\item COMPAS data set
+	\begin{itemize}
+	\item Size, availability, COMPAS scoring
+		\begin{itemize}
+		\item COMPAS (Correctional Offender Management Profiling for Alternative Sanctions) is Northpointe's (now operating under a different name) tool for guiding decisions in the criminal justice system.
+		\item The COMPAS general recidivism risk score is designed to predict recidivism within the following two years.
+		\item The data comprises 6172 subjects assessed in Broward County, Florida.
+		\item The data was made available and preprocessed by ProPublica.
+		\item Their analysis and results are presented in the original article ``Machine Bias''.
+		\item Data includes the subjects' demographic information (incl. gender, age, race) and information on their previous offences. 
+		\end{itemize}
+	\item Subsequent modifications for analysis 
+		\begin{itemize}
+		\item We created 9 synthetic judges with leniencies 0.1, 0.2, ..., 0.9. 
+		\item Subjects were distributed to all the judges evenly and at random to enable comparison with the contraction method.
+		\item We employed a decider module similar to the one explained in Lakkaraju et al.'s paper, with the COMPAS score as its input.
+		\item As the COMPAS score is derived mainly from ``prior criminal history, criminal associates, drug involvement, and early indicators of juvenile delinquency problems'', the decider can be said to have information available that is not coded into the predictor variables mentioned below. (Quoted text copy-pasted from here.)
+		\item The data was split into training and test sets.
+		\item A logistic regression model was built to predict two-year recidivism from categorized age, gender, the number of prior offences, and the degree of the crime COMPAS screened for (felony/misdemeanor); see the sketch at the end of this subsection.
+		\end{itemize}
+	\item Results
+		\begin{itemize}
+		\item Results from this analysis are presented in figure X. In the figure we see that CBI follows the true evaluation curve very closely when there are more than K=? groups. Our approach also has a lower standard deviation.
+		\item We can also deduce from the figure that if this predictive model were deployed, it would not necessarily improve on the decisions made by these synthetic judges.
+		\item We also experimented with different values of the prior variance, but this did not affect the results.
+		\end{itemize}
+	\end{itemize}
+\item Catalonian data (this could just be for our method? Hide $\sim$25\% of the outcome labels and show that we can estimate the failure rate for ALL levels of leniency even though the leniency of this one judge is only 0.25) (2nd priority)
+	\begin{itemize}
+	\item Size, availability, RisCanvi scoring
+	\item Subsequent modifications for analysis
+	\item Results
+	\end{itemize}
+\end{itemize}
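+
+A sketch of the data preparation described above (scikit-learn assumed; the column names are placeholders, not the actual field names of the ProPublica extract):
+\begin{verbatim}
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+df = pd.read_csv("compas.csv")   # hypothetical path to the ProPublica data
+
+# Nine synthetic judges with leniencies 0.1, ..., 0.9; subjects assigned
+# evenly and at random so that contraction remains applicable.
+rng = np.random.default_rng(0)
+df["judge"] = rng.permutation(np.arange(len(df)) % 9)
+df["leniency"] = (df["judge"] + 1) / 10
+
+# Decider module in the spirit of Lakkaraju et al., with the COMPAS score as
+# input: within each judge's caseload, release the fraction `leniency` of
+# subjects with the lowest scores.
+df["score_quantile"] = df.groupby("judge")["compas_score"].rank(pct=True)
+df["t"] = (df["score_quantile"] <= df["leniency"]).astype(int)
+
+# Predictive model: logistic regression for two-year recidivism from
+# categorized age, gender, number of priors and degree of charge; the
+# categorical columns are one-hot encoded before splitting.
+features = pd.get_dummies(df[["age_cat", "gender", "priors_count", "charge_degree"]])
+X_tr, X_te, y_tr, y_te = train_test_split(features, df["two_year_recid"],
+                                          test_size=0.5, random_state=0)
+model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
+pred = model.predict_proba(X_te)[:, 1]   # predicted recidivism probability
+\end{verbatim}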
 
+\section{Discussion}
+
+\begin{itemize}
+\item Conclusions 
+\item Future work / Impact
+\end{itemize}
+
+
+% \textbf{Acknowledgments.}
+%The computational resources must be mentioned. 
 
 %\clearpage
 % \balance
-- 
GitLab