\appendix

%\section{Technical details}

%\note{Riku}{From KDD: ''In addition, authors can provide an optional two (2) page supplement at the end of their submitted paper (it needs to be in the same PDF file and start at page 10) focused on reproducibility. This supplement can only be used to include (i) information necessary for reproducing the experimental results, insights, or conclusions reported in the paper (e.g., various algorithmic and model parameters and configurations, hyper-parameter search spaces, details related to dataset filtering and train/test splits, software versions, detailed hardware configuration, etc.), and (ii) any pseudo-code, or proofs that due to space limitations, could not be included in the main nine-page manuscript, but that help in reproducibility (see reproducibility policy below for more details).''}

%Specify the following for the technical appendix:

%\begin{itemize}
%\item Computing environment, versions of
%	\begin{itemize}
%	\item Python 3.6.9
%	\item Stan (PyStan v.2.19.0.0 with cmdstanpy 0.4.3)
%	\end{itemize}
%\item Full model specification
%\item Replication specifics, see above from their requirements
%	\begin{itemize}
%	\item Can we only share link to repository?
%	\item Contraction specification to appendix?
%	\end{itemize}
%\end{itemize}


\section{Counterfactual Inference}\label{sec:counterfactuals}

%\note{Antti}{Writing here in the language I know, to make the assumptions we are making clear.}

Here we derive Equation~\ref{eq:counterfactual_eq} via Pearl's counterfactual inference protocol, which involves three steps: abduction, action, and prediction \cite{pearl2000}. Our model can be represented with the following structural equations over the graph structure in Figure~\ref{fig:causalmodel}:
\begin{align}
\judge & := \epsilon_{\judge}, \quad
\unobservable := \epsilon_\unobservable, \quad
\obsFeatures := \epsilon_\obsFeatures,  \\
\decision & := g(\judge,\obsFeatures,\unobservable,\epsilon_{\decision}), \quad
\outcome := f(\decision,\obsFeatures,\unobservable,\epsilon_\outcome). \nonumber
\end{align}
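As a concrete illustration, the following minimal sketch generates data from these structural equations, assuming standard-Gaussian disturbances and the logistic forms of Equations~\ref{eq:judgemodel} and~\ref{eq:defendantmodel} for $g$ and $f$; the coefficient names and values are illustrative, not the paper's actual simulation code.
\begin{verbatim}
import numpy as np
from scipy.special import expit  # standard logistic function

rng = np.random.default_rng(0)

def generate_data(n, alpha_j=0.0, gamma_x=1.0, gamma_z=1.0,
                  alpha_y=0.0, beta_x=1.0, beta_z=1.0):
    x = rng.standard_normal(n)   # X := eps_X
    z = rng.standard_normal(n)   # Z := eps_Z
    # T := g(J, X, Z, eps_T): negative decision with prob. P(T=0 | x, z)
    t = 1 - rng.binomial(1, expit(alpha_j + gamma_x * x + gamma_z * z))
    # Y := f(T, X, Z, eps_Y): Y = 1 deterministically when T = 0
    y = np.where(t == 0, 1,
                 1 - rng.binomial(1, expit(alpha_y + beta_x * x
                                           + beta_z * z)))
    return x, z, t, y
\end{verbatim}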
%\vspace{-5pt}
%\hrulefill
\noindent
For any case where $\decision=0$ in the data, we calculate the counterfactual value of $\outcome$ had the decision been $\decision=1$.  We assume here that all parameters, functions and distributions of the model are known.
In the \emph{abduction} step we determine $\prob{\epsilon_\judge, \epsilon_\unobservable, \epsilon_\obsFeatures, \epsilon_{\decision},\epsilon_\outcome|\judgeValue,\obsFeaturesValue,\decision=0}$, the distribution of the stochastic disturbance terms updated to take into account the observed evidence on the decision maker, the observed features and the decision (given the decision $\decision=0$, the disturbances are independent of $\outcome$). %At this point we make use of the additional information a negative decision has on the unobserved risk factor $Z$. 
We directly know that $\epsilon_\obsFeatures=\obsFeaturesValue$ and $\epsilon_{\judge}=\judgeValue$. 
%PROBLEM: As the next step of inferring outcome $\outcome$ is not affected by $\epsilon_{\decision}$ we do not need to take it into account. \acomment{Is this what happens?} \rcomment{See big note.}  
Due to the special form of $f$, the observed evidence is independent of $\epsilon_\outcome$ when $\decision = 0$, so we only need to determine $\prob{\epsilon_\unobservable,\epsilon_{\decision}|\judgeValue,\obsFeaturesValue,\decision=0}$.
Next, the \emph{action} step intervenes on $\decision$, setting $\decision=1$.
Finally, in the \emph{prediction} step, we estimate $\outcome$:
\begin{eqnarray*}
&&\hspace{-10mm}E_{\decision \leftarrow 1}(\outcome|\judgeValue,\decision=0,\obsFeaturesValue)\\
 &=&  \hspace{-3mm}  \int   f(\decision=1,\obsFeaturesValue,\unobservable = \epsilon_\unobservable,\epsilon_\outcome)   \prob{\epsilon_\unobservable, \epsilon_\decision |\judgeValue,\decision=0,\obsFeaturesValue}
\prob{\epsilon_\outcome}  \diff{\epsilon_{\unobservable}} \diff{\epsilon_\outcome} \diff{\epsilon_\decision}\\
 &=&   \hspace{-3mm}   \int   \prob{\outcome=1|\decision=1,\obsFeaturesValue,\unobservableValue}  \prob{\unobservableValue|\judgeValue,\decision=0,\obsFeaturesValue} \diff{\unobservableValue},
\end{eqnarray*}
where we used $\epsilon_\unobservable=\unobservableValue$ and integrated out $\epsilon_\decision$ and $\epsilon_\outcome$. This gives us the counterfactual expectation of $\outcome$ for a single subject.
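In practice, the last integral can be approximated by self-normalized importance sampling: draw \unobservableValue from its Gaussian prior, weight each draw by the probability of the observed negative decision, and average the outcome probability under $\decision=1$. Below is a minimal sketch under the same illustrative logistic forms and coefficient names as the simulation sketch above; it is not the paper's actual implementation.
\begin{verbatim}
import numpy as np
from scipy.special import expit

rng = np.random.default_rng(1)

def counterfactual_expectation(x, alpha_j, gamma_x=1.0, gamma_z=1.0,
                               alpha_y=0.0, beta_x=1.0, beta_z=1.0,
                               n_samples=100_000):
    # abduction: weight prior draws of Z by P(T = 0 | judge, x, z)
    z = rng.standard_normal(n_samples)
    w = expit(alpha_j + gamma_x * x + gamma_z * z)
    w /= w.sum()
    # action + prediction: set T = 1 and average P(Y = 1 | T = 1, x, z)
    p_y1 = 1.0 - expit(alpha_y + beta_x * x + beta_z * z)
    return float(np.sum(w * p_y1))
\end{verbatim}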


%\subsection{Counterfactual Inference (OLD with LENIENCY PROBLEM)}
%
%%\note{Antti}{Writing here in the language I know, to make the assumptions we are making clear.}
%
%Here we derive Equation~4, via Pearl's counterfactual inference protocol involving three steps: abduction, action, and inference. Our model can be represented with the following structural equations over the graph structure in Figure~2:
%
%\noindent
%\hrulefill
%\begin{align}
%\leniency_\human & := \epsilon_{\leniency_\human}, \quad   %\epsilon_r \sim N(0,\sigma_z^2)  
%\nonumber \\
%\unobservable & := \epsilon_\unobservable, \quad   %\epsilon_z \sim N(0,\sigma_z^2)
% \nonumber \\
% \obsFeatures & := \epsilon_\obsFeatures, \quad   %\epsilon_z \sim N(0,\sigma_z^2)
% \nonumber \\
%\decision_\human & := g(\leniency,\obsFeatures,\unobservable,\epsilon_{\decision_\human }),  \nonumber\\
%\outcome & := f(\decision_\human,\obsFeatures,\unobservable,\epsilon_\outcome).  \nonumber 
%\end{align}
%
%\vspace{-5pt}
%
%\hrulefill
%
%\noindent
%For any cases where $\decision_\human=0$ in the data, we calculate the counterfactual value of $\outcome$ if we had had $\decision_\human=1$.  We assume here that all these parameters, functions and distributions are known.
%In the \emph{abduction} step we determine $\prob{\epsilon_\leniency, \epsilon_\unobservable, \epsilon_\obsFeatures, \epsilon_{\decision_H},\epsilon_\outcome|\leniencyValue_\human,\obsFeaturesValue,\decision_H=0,\outcome=1}$,  the distribution of the stochastic disturbance terms updated to take into account the observed evidence on judge leniency, observed features and the decision. %At this point we make use of the additional information a negative decision has on the unobserved risk factor $Z$. 
%We directly know $\epsilon_\obsFeatures=\obsFeaturesValue$ and $\epsilon_{\leniency_\human}=\leniencyValue_\human$. As the next step of inferring outcome $\outcome$ is not affected by $\epsilon_{\decision_\human}$ we do not need to take it into account. \acomment{Is this what happens?} \rcomment{See big note.}  Due to the special form of $f$ the observed evidence is independent of $\epsilon_\outcome$ when $\decision_\human = 0$. We only need to determine $\prob{\epsilon_\unobservable| \leniencyValue_\human,\decision_\human=0,\obsFeaturesValue}$.
%Next, the \emph{action} step involves intervening on $\decision_\human$ and setting $\decision_\human=1$ by intervention.
%Finally in the \emph{prediction} step we estimate $\outcome$:
%\begin{eqnarray*}
%E_{\decision_\human \leftarrow 1}(\outcome|\leniencyValue_\human,\decision_\human=0,\obsFeaturesValue)%&=&    %\int   f(T=1,x,Z=\epsilon_z,\epsilon_Y) \\
%%&& P(Z=\epsilon_Z|R=\epsilon_R, T=0, x)
%% P(\epsilon_Y) d\epsilon_Z d\epsilon_Y \\
% &=&    \int   f(\decision_\human=1,\obsFeaturesValue,\unobservableValue,\epsilon_\outcome)   \prob{\epsilon_\unobservable |\leniencyValue_\human,\decision_\human=0,\obsFeaturesValue}
%\prob{\epsilon_\outcome}  d\epsilon_{\unobservable} \diff{\epsilon_\outcome}\\
% &=&    \int   \prob{\outcome=1|\decision_\human=1,\obsFeaturesValue,\unobservableValue}  \prob{\unobservableValue|\leniencyValue_\human, \decision_\human=0,\obsFeaturesValue} \diff{\unobservableValue}
%\end{eqnarray*}
%This gives us the counterfactual expectation of $Y$ for a single subject.
%
%\note{Riku}{Do we actually know the  true value of $\epsilon_{\leniency_\human}$? What we do know is some \emph{observed} leniency / acceptance rate. In stan modelling I model the leniency (\texttt{alpha\_T} in code) to obtain the correct value for \unobservableValue. But intuitively I think $\epsilon_{\leniency_\human}$ should drop out in some phase anyway. One could follow the above derivation but just augment it with $\epsilon_{\leniency_\human}$ so then
%\begin{eqnarray*}
%E_{\decision_\human \leftarrow 1}(\outcome|\leniencyValue_\human,\decision_\human=0,\obsFeaturesValue)%&=&    %\int   f(T=1,x,Z=\epsilon_z,\epsilon_Y) \\
%%&& P(Z=\epsilon_Z|R=\epsilon_R, T=0, x)
%% P(\epsilon_Y) d\epsilon_Z d\epsilon_Y \\
% &=&    \int   f(\decision_\human=1,\obsFeaturesValue,\unobservableValue,\epsilon_\outcome)  \prob{\epsilon_\outcome} \prob{\epsilon_\unobservable |\epsilon_{\leniency_\human},\decision_\human=0,\obsFeaturesValue} \prob{\epsilon_{\leniency_\human}}
%  \diff{\epsilon_{\leniency_\human}} \diff{\epsilon_{\unobservable}} \diff{\epsilon_\outcome}\\
% &=&    \int   \prob{\outcome=1|\decision_\human=1,\obsFeaturesValue,\unobservableValue}  \prob{\unobservableValue| \epsilon_{\leniency_\human}, \decision_\human=0,\obsFeaturesValue} \prob{\epsilon_{\leniency_\human}} \diff{\epsilon_{\leniency_\human}} \diff{\unobservableValue}.
%\end{eqnarray*}
%
%But this leaves us with $\epsilon_{\leniency_\human}$ in the equation.
%
%}

\section{On the Priors} \label{sec:model_definition}\label{sec:priors}

%\iffalse
%\note{Riku}{Copied from sec 3.5}
%
%The causal diagram of Figure~\ref{fig:causalmodel} provides the structure of causal relationships for quantities of interest.
%%
%In addition, we consider \judgeAmount instances $\{\human_j, j = 1, 2, \ldots, \judgeAmount\}$ of decision makers \human.
%%
%For the purposes of Bayesian modelling, we present the hierarchical model and explicate our assumptions about the relationships and the quantities below.
%%
%Note that index $j$ refers to decision maker $\human_j$ and \invlogit is the standard logistic function.
%
%\noindent
%\hrulefill
%\begin{align}
%\prob{\unobservable = \unobservableValue} & = (2\pi)^{-\nicefrac{1}{2}}\exp(-\unobservableValue^2/2)  \nonumber \\
%\prob{\decision = 0~|~\leniency_j = \leniencyValue, \obsFeatures = \obsFeaturesValue, \unobservable = \unobservableValue} & = \invlogit(\alpha_j + \gamma_\obsFeaturesValue\obsFeaturesValue + \gamma_\unobservableValue \unobservableValue + \epsilon_\decisionValue),  \label{eq:judgemodel} \\
%	\text{where}~ \alpha_{j} & \approx \logit(\leniencyValue_j) \label{eq:leniencymodel}\\
%\prob{\outcome=0~|~\decision, \obsFeatures=\obsFeaturesValue, \unobservable=\unobservableValue} & =
%	\begin{cases}
%		0,~\text{if}~\decision = 0\\
%		\invlogit(\alpha_\outcomeValue + \beta_\obsFeaturesValue \obsFeaturesValue + \beta_\unobservableValue \unobservableValue + \epsilon_\outcomeValue),~\text{o/w} \label{eq:defendantmodel}
%	\end{cases}
%\end{align}
%\hrulefill
%
%
%As stated in the equations above, we consider normalized features \obsFeatures and \unobservable.
%%
%Moreover, the probability that the decision maker makes a positive decision takes the form of a logistic function (Equation~\ref{eq:judgemodel}).
%% 
%Note that we are making the simplifying assumption that coefficients $\gamma$ are the same for all defendants, but decision makers are allowed to differ in intercept $\alpha_j \approx \logit(\leniencyValue_j)$ so as to model varying leniency levels among them (Eq. \ref{eq:leniencymodel}).
%%
%The probability that the outcome is successful conditional on a positive decision (Eq.~\ref{eq:defendantmodel}) is also provided by a logistic function, applied on the same features as the logistic formula of equation \ref{eq:judgemodel}.
%%
%In general, these two logistic functions may differ in their coefficients.
%%
%However, in many settings, a decision maker would be considered good if the two functions were the same -- i.e., if the probability to make a positive decision was the same as the probability to obtain a successful outcome after a positive decision.
%
%\fi

The priors in the Bayesian model for the coefficients $\gamma_\obsFeatures, ~\beta_\obsFeatures, ~\gamma_\unobservable$ and $\beta_\unobservable$ were defined using the Gamma-mixture representation of Student's $t$-distribution with $\nu=6$ degrees of freedom.
%
The Gamma-mixture is obtained by first sampling a precision parameter from a Gamma($\nicefrac{\nu}{2},~\nicefrac{\nu}{2}$) distribution.
%
The coefficient is then drawn from a zero-mean Gaussian distribution with variance equal to the inverse of the sampled precision.
%
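Equivalently, if $\eta \sim \text{Gamma}(\nicefrac{\nu}{2}, \nicefrac{\nu}{2})$ and $\beta \mid \eta \sim N(0, \eta^{-1})$, then marginally
\begin{equation*}
\prob{\beta} = \int_0^\infty N(\beta \mid 0, \eta^{-1}) \, \text{Gamma}(\eta \mid \nicefrac{\nu}{2}, \nicefrac{\nu}{2}) \diff{\eta} = t_\nu(\beta),
\end{equation*}
i.e., $\beta$ follows Student's $t$-distribution with $\nu$ degrees of freedom.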
The precision parameters $\eta_\unobservable, ~\eta_{\beta_\obsFeatures}$ and $\eta_{\gamma_\obsFeatures}$ were sampled independently from Gamma$(\nicefrac{6}{2},~\nicefrac{6}{2})$, and the coefficients were then sampled from Gaussian distributions with expectation $0$ and variances $\eta_\unobservable^{-1}, ~\eta_{\beta_\obsFeatures}^{-1}$ and $\eta_{\gamma_\obsFeatures}^{-1}$, as shown below. 
%
For vector-valued \obsFeatures, the components of $\gamma_\obsFeatures$ ($\beta_\obsFeatures$) were sampled independently with a joint precision parameter $\eta_{\gamma_\obsFeatures}$ ($\eta_{\beta_\obsFeatures}$).
%
The coefficients for the unobserved confounder \unobservable were restricted to positive values to ensure identifiability.
\begin{align}
\eta_\unobservable, ~\eta_{\beta_\obsFeatures}, ~\eta_{\gamma_\obsFeatures} & \sim \text{Gamma}(3, 3) \nonumber\\
\gamma_\unobservable, ~\beta_\unobservable & \sim N_+(0, \eta_\unobservable^{-1}),\quad
\gamma_\obsFeatures \sim N(0, \eta_{\gamma_\obsFeatures}^{-1}),\quad
\beta_\obsFeatures \sim N(0, \eta_{\beta_\obsFeatures}^{-1})\nonumber
\end{align}

The intercepts for the decision makers in the data and for the outcome \outcome had hierarchical Gaussian priors with variances $\sigma_\decision^2$ and $\sigma_\outcome^2$; the decision makers shared the joint variance parameter $\sigma_\decision^2$.
\begin{align}
\sigma_\decision^2, ~\sigma_\outcome^2 \sim N_+(0, \tau^2),\quad
\alpha_\judgeValue \sim N(0, \sigma_\decision^2),\quad
\alpha_\outcome \sim N(0, \sigma_\outcome^2)
\end{align}
%
The variance parameters $\sigma_\decision^2$ and $\sigma_\outcome^2$ were drawn independently from a zero-mean Gaussian distribution restricted to the positive real axis, with variance $\tau^2=1$.
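For reference, one joint draw from these priors can be sketched as follows; the paper's implementation uses Stan, so the NumPy-based presentation and variable names here are only illustrative.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)

def sample_priors(n_judges, dim_x, nu=6.0, tau=1.0):
    # Gamma(nu/2, nu/2) precisions (NumPy parametrizes scale = 1/rate)
    eta_z, eta_gx, eta_bx = rng.gamma(nu / 2, 2 / nu, size=3)
    # coefficients; those of Z are restricted to positive values
    gamma_z = abs(rng.normal(0, eta_z ** -0.5))
    beta_z = abs(rng.normal(0, eta_z ** -0.5))
    gamma_x = rng.normal(0, eta_gx ** -0.5, size=dim_x)
    beta_x = rng.normal(0, eta_bx ** -0.5, size=dim_x)
    # half-Gaussian hyperpriors (tau = 1) for the intercept variances
    sigma_t2, sigma_y2 = abs(rng.normal(0, tau, size=2))
    alpha_j = rng.normal(0, sigma_t2 ** 0.5, size=n_judges)
    alpha_y = rng.normal(0, sigma_y2 ** 0.5)
    return gamma_z, beta_z, gamma_x, beta_x, alpha_j, alpha_y
\end{verbatim}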

%\hide{
%The sampler diagnostics exhibited poor performance only with XXX decider having E-BFMI value constantly below the nominal threshold of 0.2. Having a low value of E-BFMI with the sampler implies that the posterior may not have been explored fully.
%}
\section{Independent decision maker}\label{sec:independent}

%TAKING OUT THIS COMMENT, MICHAEL(?) HAS ADDED THE POINT IN RELATED WORK
%\rcomment{Similar decision maker has been proposed by \citet{kleinberg2018human}, see p. 256. They formalize the decision making threshold for the decision maker as a trade-off point between the costs of incarceration and committing a crime (as evaluated by that judge).}
In Section~\ref{sec:decisionmakers} we formulated an {\it independent} decision maker. 
%
Here we motivate it.
%
The independent decision maker stems from the notion that an experienced decision maker has made decisions on numerous subjects in the past.
%
The decision maker thus has a good idea of the absolute dangerousness of a subject and can simply make a negative decision whenever the subject's probability of a negative outcome is too high.
%
This threshold in itself implies a level of leniency for the decision maker: since subjects are assigned at random, a high threshold makes negative decisions rare and hence the decision maker lenient.
%
To study the effect of differing levels of leniency of the decision makers, we show below that this threshold is equivalent to a level of leniency. 

The independent decision maker makes its decisions independently for each subject, based on the features \obsFeatures and \unobservable, which are standard Gaussian random variables. 
For a subject, we first generate their features and then assign them to a decision maker with some leniency $\leniencyValue'$.
The decision is assigned deterministically based on the features:
\begin{equation} \label{eq:Tdet}
  \decision=\begin{cases}
    0, & \text{if } \prob{\outcome=0|~\obsFeatures= \obsFeaturesValue, \unobservable= \unobservableValue} \geq F^{-1}(\leniencyValue')\\
    1, & \text{otherwise}.
  \end{cases}
\end{equation}
%
In the above equation, $\prob{\outcome=0|~\obsFeatures= \obsFeaturesValue, \unobservable= \unobservableValue}$ is the probability of a negative outcome given the features, as predicted by the judge.
The prediction is computed with Equation~\ref{eq:judgemodel}, assuming that the judge is nearly perfect, i.e., that $\gamma_\unobservable \approx \beta_\unobservable$ and $\gamma_\obsFeatures \approx \beta_\obsFeatures$. 
We note that the right-hand side of Equation~\ref{eq:defendantmodel} defines a random variable when the values of \obsFeatures and \unobservable are not known. 
This random variable is a logistic transformation of a sum of two Gaussian random variables and hence follows a \emph{logit-normal distribution}.
The inverse cumulative distribution function $F^{-1}(\leniencyValue')$ in Equation~\ref{eq:Tdet} is then the inverse cumulative distribution function of a logit-normal distribution with mean $\mu=0$ and variance $s^2=\beta_\obsFeatures^2 + \beta_\unobservable^2$. 
The inverse cdf is
\begin{equation*} %\label{eq:cum_inv}
F^{-1}(\leniencyValue') = \sigma\left(\text{erf}^{-1}(2\leniencyValue'-1)\sqrt{2s^2}+\mu\right),
\end{equation*}
where $\sigma(\cdot)$ denotes the standard logistic function.
After assigning the decisions with this method, the observed leniency \leniencyValue of each decision maker was computed from the sample of all observations assigned to that judge.
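For concreteness, below is a minimal sketch of this decision rule, assuming $\mu = 0$ and illustrative coefficient values; the nearly perfect judge reuses the outcome coefficients for its prediction.
\begin{verbatim}
import numpy as np
from scipy.special import erfinv, expit

rng = np.random.default_rng(3)

def independent_decisions(x, z, r, beta_x=1.0, beta_z=1.0):
    s2 = beta_x ** 2 + beta_z ** 2  # variance of beta_x*X + beta_z*Z
    threshold = expit(erfinv(2 * r - 1) * np.sqrt(2 * s2))  # F^{-1}(r)
    p_neg = expit(beta_x * x + beta_z * z)  # predicted P(Y = 0 | x, z)
    return (p_neg < threshold).astype(int)  # T = 0 iff p_neg >= F^{-1}(r)

# sanity check: with leniency r = 0.7, about 70% of decisions are T = 1
x, z = rng.standard_normal(10000), rng.standard_normal(10000)
print(independent_decisions(x, z, 0.7).mean())  # approximately 0.7
\end{verbatim}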
%\begin{figure}%
%    \centering
%    \subfloat[~]{{\includegraphics[width=0.5\linewidth]{./img/prior_posterior_gamma_z} }}
%    ~
%    \subfloat[~]{{\includegraphics[width=0.5\linewidth]{./img/prior_posterior_beta_z} }}
%    \caption{Prior and posterior densities for $\gamma_\unobservableValue$ (a) and $\beta_\unobservableValue$ (b). Prior density (red line) is Student's {\it t}-distribution with $6$ degrees of freedom. The estimated posterior density is shown in blue and the true value of $1$ is marked with a black dashed line.}
%    \label{fig:prior_posterior}%
%\end{figure}


%NOT GOING TO BE IN THE FINAL
%\begin{figure}
%\begin{center}\includegraphics[width=0.5\linewidth]{img/decisions_ZvsX}
%\end{center}
%\caption{This is only one judge (batch decision maker with an error term $\epsilon_\decisionValue$) with leniency 0.5. 150 out 300 have $T=1$.  56 have $ T=1,Y=0$, 94 have $T=1,Y=1$. 157 had $Y=0$ for the subjects before censoring with decision.}

%\label{fig:}
%\end{figure}

%moved to the main paper
%\begin{figure}
%\begin{center}\includegraphics[width=0.5\linewidth]{img/sl_errors_betaZ5}
%\end{center}
%\caption{Summarization of figure \ref{fig:betaZ5} in the appendix.}
\clearpage

\acomment{The remaining will not appear on the final paper.}

\section{Older Related Work}


%
Although contraction is computationally simple and efficient and estimates the true failure rate well, it has some limitations.
%
The first and perhaps most significant limitation is that the algorithm can only be applied when data is available from multiple decision makers.
%
Secondly, the performance of contraction depends on three quantities: the leniency of the most lenient decision maker, the number of decisions given by that decision maker, and the agreement rate. 
%
The agreement rate describes how often decision makers \machine and \human agree; the higher the agreement rate, the better contraction performs.
%
Thirdly, contraction is only suitable for evaluating binary outcomes, whereas the counterfactual approach readily extends to real-valued outcomes.
%
In addition, contraction can estimate the true failure rate only up to the leniency of the most lenient decision maker: if in some high-stakes application the greatest acceptance rate is only 50\%, the performance of a machine can be evaluated only up to leniency $0.5$.
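For reference, below is a compact sketch of the contraction algorithm of \citet{lakkaraju2017selective}; the column names are illustrative, \texttt{decision} equals $1$ for a positive decision, and \texttt{score} is the risk score of the evaluated model (higher means riskier).
\begin{verbatim}
import pandas as pd

def contraction(df, r):
    # q: the decision maker with the highest acceptance rate
    q = df.groupby("judge")["decision"].mean().idxmax()
    d_q = df[df["judge"] == q]       # all cases judged by q
    r_q = d_q[d_q["decision"] == 1]  # cases with observed outcomes
    # rank by the model's risk score, riskiest first
    r_sort = r_q.sort_values("score", ascending=False)
    # drop the cases the evaluated model would reject at acceptance rate r
    k = int((1.0 - r) * len(d_q)) - (len(d_q) - len(r_q))
    r_b = r_sort.iloc[max(k, 0):]
    # estimated failure rate among the model's acceptances
    return (r_b["outcome"] == 0).sum() / len(d_q)
\end{verbatim}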

\note{Riku}{How detailed must/should the description of contraction be? Limitation list is also quite long...}



\note{Michael}{It is not clear to me if we'll have a separate section and where.
I think the safest option is to place a Related Work section immediately after the introduction.
The disadvantage of that is that it may delay the presentation of the main contributions.
On the other hand, we should make sure that competing methods like \citet{lakkaraju2017selective} are sufficiently described before they appear in experiments.
}

Discuss this: 

\begin{itemize}
\item Lakkaraju and contraction. \cite{lakkaraju2017selective}
	\item Contraction
		\begin{itemize}
		\item Algorithm by Lakkaraju et al. Assumes that the subjects are assigned to the judges at random and requires that the judges differ in leniency. 
		\item Can estimate the true failure rate only up to the leniency of the most lenient decision maker.
		\item Performance is affected by the number of people judged by the most lenient decision maker, the agreement rate and the leniency of the most lenient decision maker. (Performance is guaranteed / better when ...)
		\item Works only on binary outcomes
		\item (We show that our method isn't constrained by any of these)
		\item The algorithm goes as follows...
%\begin{algorithm}[] 			% enter the algorithm environment
%\caption{Contraction algorithm \cite{lakkaraju17}} 		% give the algorithm a caption
%\label{alg:contraction} 			% and a label for \ref{} commands later in the document
%\begin{algorithmic}[1] 		% enter the algorithmic environment
%\REQUIRE Labeled test data $\D$ with probabilities $\s$ and \emph{missing outcome labels} for observations with $T=0$, acceptance rate r
%\ENSURE
%\STATE Let $q$ be the decision maker with highest acceptance rate in $\D$.
%\STATE $\D_q = \{(x, j, t, y) \in \D|j=q\}$
%\STATE \hskip3.0em $\rhd$ $\D_q$ is the set of all observations judged by $q$
%\STATE
%\STATE $\RR_q = \{(x, j, t, y) \in \D_q|t=1\}$
%\STATE \hskip3.0em $\rhd$ $\RR_q$ is the set of observations in $\D_q$ with observed outcome labels
%\STATE
%\STATE Sort observations in $\RR_q$ in descending order of confidence scores $\s$ and assign to $\RR_q^{sort}$.
%\STATE \hskip3.0em $\rhd$ Observations deemed as high risk by the black-box model $\mathcal{B}$ are at the top of this list
%\STATE
%\STATE Remove the top $[(1.0-r)|\D_q |]-[|\D_q |-|\RR_q |]$ observations of $\RR_q^{sort}$ and call this list $\mathcal{R_B}$
%\STATE \hskip3.0em $\rhd$ $\mathcal{R_B}$ is the list of observations assigned to $t = 1$ by $\mathcal{B}$
%\STATE
%\STATE Compute $\mathbf{u}=\sum_{i=1}^{|\mathcal{R_B}|} \dfrac{\delta\{y_i=0\}}{| \D_q |}$.
%\RETURN $\mathbf{u}$
%\end{algorithmic}
%\end{algorithm}


		\end{itemize}
\item Counterfactuals/Potential outcomes. \cite{pearl2010introduction} (also Rubin)
\item Approach of Jung et al for optimal policy construction. \cite{jung2018algorithmic}
	\begin{itemize}
	\item Task: They study unobserved confounding in the context of creating optimal decision policies. (Mentioned in the intro)
	\item Contributions: (1) a Bayesian model to evaluate decision algorithms in the presence of unmeasured confounding. (2) they show policy evaluation problem they consider is a generalization of estimating heterogeneous treatment effects in observational studies. (3) they  show that one can construct near-optimal decision algorithms even if there is unmeasured confounding.
	\item In contrast: They consider a 'bail or no-bail' scenario and construct a trade-off curve of the proportion of defendants failing to appear at trial vs. the proportion released without bail.
	\item They approached the problem with Bayesian modelling, but they don't consider the selective labels issue where decisions can deterministically define the outcome. (in intro)
	\item Additionally, they don't consider the effect of having multiple decision makers with differing levels of leniency. (in intro) \acomment{\cite{Jung2} does? Or what are the groups?} \rcomment{In Jung's paper, they ''divide the data into K approximately equally sized groups, ranking and binning by the estimated outcome $\hat{\mu}_0$''. There $\hat{\mu}_0$ is a regression model predicting the outcome Y given the decision is $T=0$. So their grouping is not based on the judge identities (in fact, they don't utilize them at all).}
	\end{itemize}
\item Discussions of latent confounders in multiple contexts.
%\item Classical Bayesian sensitivity analysis of \citet{mccandless2007bayesian}
%	\begin{itemize}
%	\item Task: Bayesian sensitivity analysis of the effect of an unmeasured binary confounder on a binary response with a binary exposure variable and other masured confounders.
%	\item Experiments: The writers consider the effect of different priors on the coefficient estimates logistic regression in a beta blocker therapy study.
%	\item The authors carry out a more classical analysis of the effect of priors on the estimates. There are similarities, but there are also a lot of differences, most notably lack of selective labeling and a different model structure where the observed independent variables affect both the unobserved confounder and the result. In their model the unobserved only affects the outcome.
%	\end{itemize}
\item Imputation methods and other approaches to selective labels
%\item Data augmentation approach by \citet{dearteaga2018learning}
%	\begin{itemize}
%	\item Task: Training predictive models to perform better under selective labeling utilizing the homogeneity of human decision makers. They base their approach on the notion that if decision makers consistently make a negative decision to some subjects they must be dangerous.
%	\item Contributions: They propose a method for augmenting the selectively labeled data with observations that have a selection probability under some threshold $\epsilon$. I.e. For observations with $\decision=0$, predict $\prob{\decision~|~\obsFeatures}$, augment data so that $\outcome = 0$ when $\prob{\decision~|~\obsFeatures} < \epsilon$ instead of having missing values.
%	\item In contrast: The writers assume no unobservable confounders affecting the outcome and focus only on the similarity of the assigned decisions given the features. Writers do not address the issue of leniency in their analysis.
%	\end{itemize}
\item Doubly robust methods, propensity score and other matching techniques
\end{itemize}


\section{Additional figures}


\begin{figure}[!b]
\centering
\subfloat[12 decision makers.]{\includegraphics[width = \linewidth]{./img/sl_compas_nJudges12_all}}\\
\subfloat[24 decision makers.]{\includegraphics[width = \linewidth]{./img/sl_compas_nJudges24_all}}\\
\subfloat[48 decision makers.]{\includegraphics[width = \linewidth]{./img/sl_compas_nJudges48_all}}

\caption{Results of experiments with COMPAS data using different numbers of judges.}
%\label{fig:}
\end{figure}

These figures also feature the \textbf{Probabilistic} decision maker: each subject is released with a probability given by the logistic regression model of Equation~\ref{eq:judgemodel}, where the leniency enters through $\alpha_j$. 
%\rcomment{Hard to justify any more? Or this decision maker could now be described as follows: Each subject is released with probability equal to some risk score which differs based on the assigned judge. In the experiments, the risk scores were  computed with equation \ref{eq:judgemodel} where leniency was inputted through $\alpha_j$.}
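A minimal sketch of this probabilistic decision maker, assuming the logistic model of Equation~\ref{eq:judgemodel} with illustrative coefficient names:
\begin{verbatim}
import numpy as np
from scipy.special import expit

rng = np.random.default_rng(4)

# release with probability 1 - P(T = 0 | x, z);
# the leniency enters through the judge-specific intercept alpha_j
def probabilistic_decisions(x, z, alpha_j, gamma_x=1.0, gamma_z=1.0):
    return rng.binomial(1, 1.0 - expit(alpha_j + gamma_x * x + gamma_z * z))
\end{verbatim}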

%\begin{figure}
%\includegraphics[width=\linewidth]{img/leniency_figure}
%\caption{Figure illustrating the relationship of \leniency, \obsFeatures and \unobservable. Points $A$, $B$, $C$ and $D$ represent four subjects each with different features \obsFeatures and \unobservable.
%%
%Lines $\leniencyValue_1$ and $\leniencyValue_2$ show decision boundaries for decision makers with different leniencies.
%Figure shows how while sharing features \obsFeatures subjects $A$ and $C$ receive different decisions from decision maker $1$ but not from decision maker $2$ due to difference in \unobservable.
%The figure also explicates the interplay of features \obsFeatures and \unobservable. Considering subjects $A$ and $D$, one might claim $D$ to be more dangerous than subject $A$ based on features \obsFeatures alone. However, assuming that the decision maker $2$ uses feature \unobservable efficiently, they will keep the decision the same as they observe reduction in \unobservable.}
%\label{fig:approach}
%\end{figure}

\rcomment{I just wanted to point out the following thought, which might have an error due to confounding in the model: Even though we do not discuss fairness or discrimination explicitly in our research, our presented method should correct for inherent biases of the decision makers in the data. This happens because we model the probabilities for the outcomes and decisions independently. Thus, if there is a component in \obsFeatures which affects the corresponding component of coefficient $\beta_\obsFeatures$ in a way that is not indicative of the outcome \outcome, we notice and correct for the error, and our model should reveal that for that component $\beta_\obsFeatures \neq \gamma_\obsFeatures$. So even though we rely on the judge to convey information on \unobservable, we do not do that for \obsFeatures.
For example, if the judges in the data make biased decisions based on gender, our model shows that $\beta_\text{gender} \neq \gamma_\text{gender}$ because there is a difference in the way gender affects the outcome and the decision. Including \unobservable should correct for any backdoor confounding.}
\begin{figure*}%[H]
\centering
\subfloat[Random H, Random M]{\includegraphics[width = 3in]{./img/_deciderH_random_deciderM_random_maxR_0_9coefZ1_0_all}} ~
\subfloat[Random H, batch M]{\includegraphics[width = 3in]{./img/_deciderH_random_deciderM_batch_maxR_0_9coefZ1_0_all}}\\
\subfloat[Batch H, Random M]{\includegraphics[width = 3in]{./img/_deciderH_batch_deciderM_random_maxR_0_9coefZ1_0_all}}~
\subfloat[Batch H, Batch M]{\includegraphics[width = 3in]{./img/_deciderH_batch_deciderM_batch_maxR_0_9coefZ1_0_all}} \\
\subfloat[Independent H and Random M]{\includegraphics[width = 3in]{./img/_deciderH_independent_deciderM_random_maxR_0_9coefZ1_0_all}} ~
\subfloat[Independent H, Batch M]{\includegraphics[width = 3in]{./img/_deciderH_independent_deciderM_batch_maxR_0_9coefZ1_0_all}}\\
\subfloat[Probabilistic H, Random M]{\includegraphics[width = 3in, height = 1.5in]{./img/_deciderH_probabilistic_deciderM_random_maxR_0_9coefZ1_0_all}}~
\subfloat[Probabilistic H, Batch M]{\includegraphics[width = 3in, height = 1.5in]{./img/_deciderH_probabilistic_deciderM_batch_maxR_0_9coefZ1_0_all}}\\

\caption{Figures with different deciders ($N=5000$, 50 judges, $\max(r)=0.9$, $\beta_z=\gamma_z=1$).}
\label{fig:deciders_maxr09_coefz1}
\end{figure*}
\begin{figure*}%[H]
\centering
\subfloat[Random H, Random M]{\includegraphics[width = 3in]{./img/_deciderH_random_deciderM_random_maxR_0_5coefZ1_0_all}} ~
\subfloat[Random H, batch M]{\includegraphics[width = 3in]{./img/_deciderH_random_deciderM_batch_maxR_0_5coefZ1_0_all}}\\
\subfloat[Batch H, Random M]{\includegraphics[width = 3in]{./img/_deciderH_batch_deciderM_random_maxR_0_5coefZ1_0_all}}~
\subfloat[Batch H, Batch M]{\includegraphics[width = 3in]{./img/_deciderH_batch_deciderM_batch_maxR_0_5coefZ1_0_all}} \\
\subfloat[Independent H and Random M]{\includegraphics[width = 3in]{./img/_deciderH_independent_deciderM_random_maxR_0_5coefZ1_0_all}} ~
\subfloat[Independent H, Batch M]{\includegraphics[width = 3in]{./img/_deciderH_independent_deciderM_batch_maxR_0_5coefZ1_0_all}}\\
\subfloat[Probabilistic H, Random M]{\includegraphics[width = 3in, height = 1.5in]{./img/_deciderH_probabilistic_deciderM_random_maxR_0_5coefZ1_0_all}}~
\subfloat[Probabilistic H, Batch M]{\includegraphics[width = 3in, height = 1.5in]{./img/_deciderH_probabilistic_deciderM_batch_maxR_0_5coefZ1_0_all}}\\

\caption{Figures with different deciders ($N=5000$, 50 judges, $\max(r)=0.5$, $\beta_z=\gamma_z=1$).}
\label{fig:deciders_maxr05_coefz1}
\end{figure*}
\begin{figure*}%[H]
\centering
\subfloat[Random H, Random M]{\includegraphics[width = 3in]{./img/_deciderH_random_deciderM_random_maxR_0_9coefZ5_0_all}} ~
\subfloat[Random H, batch M]{\includegraphics[width = 3in]{./img/_deciderH_random_deciderM_batch_maxR_0_9coefZ5_0_all}}\\
\subfloat[Batch H, Random M]{\includegraphics[width = 3in]{./img/_deciderH_batch_deciderM_random_maxR_0_9coefZ5_0_all}}~
\subfloat[Batch H, Batch M]{\includegraphics[width = 3in]{./img/_deciderH_batch_deciderM_batch_maxR_0_9coefZ5_0_all}} \\
\subfloat[Independent H and Random M]{\includegraphics[width = 3in]{./img/_deciderH_independent_deciderM_random_maxR_0_9coefZ5_0_all}} ~
\subfloat[Independent H, Batch M]{\includegraphics[width = 3in]{./img/_deciderH_independent_deciderM_batch_maxR_0_9coefZ5_0_all}}\\
\subfloat[Probabilistic H, Random M]{\includegraphics[width = 3in]{./img/_deciderH_probabilistic_deciderM_random_maxR_0_9coefZ5_0_all}}~
\subfloat[Probabilistic H, Batch M]{\includegraphics[width = 3in, height = 1.5in]{./img/_deciderH_probabilistic_deciderM_batch_maxR_0_9coefZ5_0_all}}\\

\caption{Figures with different deciders ($N=5000$, 50 judges, $\max(r)=0.9$, $\beta_z=\gamma_z=5$).}
\label{fig:betaZ5}
\end{figure*}
%%%

\begin{figure*}%[H]
\centering
\subfloat[Random M, $\max(r)=0.9, \beta_z=\gamma_z=1$]{\includegraphics[width = 3in]{./img/with_epsilon_deciderH_independent_deciderM_random_maxR_0_9coefZ1_0_all}} ~
\subfloat[Batch M, $\max(r)=0.9, \beta_z=\gamma_z=1$]{\includegraphics[width = 3in]{./img/with_epsilon_deciderH_independent_deciderM_batch_maxR_0_9coefZ1_0_all}} \\
\subfloat[Random M, $\max(r)=0.5, \beta_z=\gamma_z=1$]{\includegraphics[width = 3in]{./img/with_epsilon_deciderH_independent_deciderM_random_maxR_0_5coefZ1_0_all}} ~
\subfloat[Batch M, $\max(r)=0.5, \beta_z=\gamma_z=1$]{\includegraphics[width = 3in]{./img/with_epsilon_deciderH_independent_deciderM_batch_maxR_0_5coefZ1_0_all}} \\
\subfloat[Random M, $\max(r)=0.9, \beta_z=\gamma_z=5$]{\includegraphics[width = 3in]{./img/with_epsilon_deciderH_independent_deciderM_random_maxR_0_9coefZ5_0_all}} ~
\subfloat[Batch M, $\max(r)=0.9, \beta_z=\gamma_z=5$]{\includegraphics[width = 3in]{./img/with_epsilon_deciderH_independent_deciderM_batch_maxR_0_9coefZ5_0_all}} \\
\caption{Figures with an independent decider H (with error term in decisions) and logistic regression imputation ($N=5000$, 50 judges). The curves for logistic regression imputation and labeled outcomes overlap heavily in subfigure e. In logistic regression evaluation we impute all the missing outcomes in the test data and then deploy the true evaluation on the imputed data. The data is imputed using a regression model built on the subjects with observed outcomes in the test data.}
%\label{}
\end{figure*}


%%%
%
%\begin{figure}[H]
%\subfloat[Random H, Random M]{\includegraphics[width = 3in]{./img/random_H_random_M}} ~
%\subfloat[Random H, batch M]{\includegraphics[width = 3in]{./img/random_H_batch_M}}\\
%\subfloat[Batch H, Random M]{\includegraphics[width = 3in, height = 1.5in]{example-image}}~
%\subfloat[Batch H, Batch M]{\includegraphics[width = 3in]{./img/lakkarajus_H_batch_M}} \\
%\subfloat[Independent H and Random M]{\includegraphics[width = 3in, height = 1.5in]{example-image}} ~
%\subfloat[Independent H, Batch M]{\includegraphics[width = 3in]{./img/independent_H_batch_M}}\\
%\subfloat[Probabilistic H, Random M]{\includegraphics[width = 3in]{./img/probabilistic_H_random_M}}~
%\subfloat[Probabilistic H, Batch M]{\includegraphics[width = 3in]{./img/probabilistic_H_batch_M}}\\
%
%\caption{Figures with different configs.}
%\label{some example}
%\end{figure}
%
%%%
%
%
%\begin{figure}[H]
%\subfloat[Probabilistic H, batch M, $\beta_Z=\gamma_Z=5$]{\includegraphics[width = 3in]{./img/probabilistic_H_batch_M_coef_betaZ_5}} ~
%\subfloat[Probabilistic H, batch M, max$(r)=0.5$.]{\includegraphics[width = 3in]{./img/sl_rmax05}}
%\caption{Figures with different configs. (cont.)}
%\label{some example}
%\end{figure}

%\newpage