\section{Counterfactual-Based Imputation For Selective Labels}


\begin{figure*}
\begin{center}
\includegraphics[height=2in]{img/setting}
\end{center}
\caption{Setting. Negative decisions by decision maker $M$ ($T_{_M} = 0$) are evaluated as successful ($Y_{_M} = 1$) (dashed arrows). For negative decisions by decision maker $H$ ($T_{_H} = 0$), the outcome is evaluated according to the table of imputed outcomes (dotted arrows). Imputed outcomes are produced from the dataset outcomes by making a counterfactual prediction for the cases in which $H$ made a negative decision (solid arrows).
}
\end{figure*}
\subsection{Causal Modeling}
$X$ and $Z$ are assumed to be continuous Gaussian variables, interpreted as aggregated risk factors such that higher values denote a higher risk of a negative outcome ($Y=0$).
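To fix ideas, the following toy sketch generates data consistent with this interpretation. It is a minimal illustration only: the link between the risk factors and the outcome and its unit coefficients are assumed here, not taken from the model used in the experiments.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n = 1000

# X and Z are Gaussian risk scores; larger values mean higher risk of Y = 0.
X = rng.normal(size=n)                      # observable risk factor
Z = rng.normal(size=n)                      # unobservable risk factor
p_fail = 1.0 / (1.0 + np.exp(-(X + Z)))     # illustrative link; unit coefficients assumed
Y = rng.binomial(1, 1.0 - p_fail)           # Y = 0 denotes the negative outcome
\end{verbatim}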

\subsection{Imputation}

Our approach is based on the fact that in almost all cases, some information regarding the latent variable is recoverable. For illustration, consider defendant $i$ who has been given a negative decision $\decisionValue_i = 0$. If the defendant's observable features $\featuresValue_i$ indicate that the subject would have been safe to release, we can deduce that the unobservable variable $\unobservableValue_i$ must have indicated high risk, since the defendant was nevertheless jailed. In turn, this makes $Y=0$ more likely than would have been predicted based on $\featuresValue_i$ alone.
Conversely, when the features $\featuresValue_i$ already clearly indicate risk and the defendant is subsequently jailed, the decision reveals little additional information about the latent variable.
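To make this intuition concrete, the following small numerical sketch weights a standard normal prior on $\unobservable$ by an assumed logistic decision model (the model form and its unit coefficients are illustrative only, not estimated from data):
\begin{verbatim}
import numpy as np

# Grid approximation for the latent risk factor z ~ N(0, 1)
z = np.linspace(-4, 4, 2001)
prior = np.exp(-0.5 * z**2)                 # unnormalised N(0, 1) density

def p_jail(x, z):
    # Assumed decision model: P(T = 0 | x, z) grows with the total risk x + z
    return 1.0 / (1.0 + np.exp(-(x + z)))

for x in (-2.0, 2.0):                       # x = -2: looks safe;  x = +2: looks risky
    post = prior * p_jail(x, z)             # unnormalised posterior of z given T = 0 and x
    post /= post.sum()
    print(f"x = {x:+.0f}:  E[z | T=0, x] = {(z * post).sum():.2f}")
\end{verbatim}
Under these assumptions the posterior mean of $z$ given a negative decision is clearly positive when $\featuresValue$ looks safe, and stays close to the prior mean when $\featuresValue$ already looks risky, matching the argument above.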

\acomment{Could emphasize the above with a plot, with $x$ and $z$ on the axes and point styles indicating the decision.}
\acomment{The above assumes that the decision maker in the data is not totally bad.}


In counterfactual-based imputation we use counterfactual values of the outcome $\outcome_{\decisionValue=1}$ to impute the missing labels. The SCM required to compute the counterfactuals is presented in figure~\ref{fig:causalmodel}. Using Stan, we model the observed data as
\begin{align}
 \outcome ~|~\decision = 1, x & \sim \text{Bernoulli}(\invlogit(\alpha_y + \beta_{xy} x + \beta_{zy} z)) \nonumber \\
 \decision ~|~ j, x & \sim \text{Bernoulli}(\invlogit(\alpha_{j} + \beta_{xt} x + \beta_{zt} z)). \label{eq:data_model}
\end{align}


That is, using all of the data, we fit one logistic regression model for the decisions, based on the observable features \features and the identity of the judge. The identity of the judge is encoded in the judge-specific intercept $\alpha_j$ (different judges get different intercepts). The observed outcomes of the cases with $\decision = 1$ are modelled with a separate logistic regression to learn the remaining parameters: the coefficients $\beta_{xy}$ for the observed features and $\beta_{zy}$ for the unobserved features, the single intercept $\alpha_y$, and the values of the latent variable \unobservable.
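For clarity, the two regressions of equation \ref{eq:data_model} can be written out as in the sketch below. This is an illustration only: the actual model is fitted with Stan, and the Python form and function names here are ours.
\begin{verbatim}
import numpy as np

def inv_logit(a):
    return 1.0 / (1.0 + np.exp(-a))

def p_decision(x, z, alpha_j, beta_xt, beta_zt):
    # P(T = 1 | judge j, x, z); alpha_j is the judge-specific intercept
    return inv_logit(alpha_j + beta_xt * x + beta_zt * z)

def p_outcome(x, z, alpha_y, beta_xy, beta_zy):
    # P(Y = 1 | T = 1, x, z); learned only from cases with a positive decision
    return inv_logit(alpha_y + beta_xy * x + beta_zy * z)
\end{verbatim}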

Using the posterior samples of all the parameters provided by Stan, we can estimate the values of the counterfactuals. Formally, the counterfactuals are drawn from the posterior predictive distribution
\[
p(\tilde{y}|y) = \int_\Omega p(\tilde{y}|\theta)\,p(\theta|y)\,d\theta.
\]
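With $S$ posterior draws $\theta^{(1)}, \ldots, \theta^{(S)}$ from $p(\theta|y)$, this integral is approximated by the Monte Carlo average
\[
p(\tilde{y}|y) \approx \frac{1}{S} \sum_{s=1}^{S} p(\tilde{y}|\theta^{(s)}),
\]
that is, one candidate outcome is drawn per posterior sample.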

In practice, Stan gives us $S$ samples of all model parameters from the posterior distribution $p(\theta|y)$ (the probability of the parameters given the data), and we use these samples to draw plausible values for the missing outcomes. Suppose the outcome $\outcomeValue_i$ of observation $i$ is missing. From Stan we have $S$ posterior draws of the coefficients, the intercepts and the latent value $\unobservableValue_i$. Plugging each draw into the outcome model on the first line of equation \ref{eq:data_model}, we draw a counterfactual outcome from $y_{i, \decisionValue=1}  \sim \text{Bernoulli}(\invlogit(\alpha_y + \beta_{xy} x_i + \beta_{zy} z_i))$. In essence, we use the sampled parameter values from the posterior to sample new values for the missing outcomes. As this gives $S$ ``guesses'' for each missing outcome, we compute the failure rate separately for each set of guesses and report their mean.
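The sampling step can be vectorised over the posterior draws as in the following sketch. It assumes the draws have already been extracted from Stan into NumPy arrays; the array layout and names are ours.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

def impute_counterfactuals(x, z, alpha_y, beta_xy, beta_zy):
    # Draw counterfactual outcomes y_{T=1} for subjects with missing labels.
    #   x                         : (n,) observed features of these subjects
    #   z                         : (S, n) posterior draws of their latent variable
    #   alpha_y, beta_xy, beta_zy : (S,) posterior draws of the outcome model
    # Returns an (S, n) array with one imputation per posterior draw.
    logit_p = (alpha_y[:, None] + beta_xy[:, None] * x[None, :]
               + beta_zy[:, None] * z)
    return rng.binomial(1, 1.0 / (1.0 + np.exp(-logit_p)))
\end{verbatim}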


\begin{algorithm}
\DontPrintSemicolon
\KwIn{Test data set $\dataset = \{x, j, t, y\}$, acceptance rate $r$} 
\KwOut{Failure rate at acceptance rate $r$} 
Using Stan, draw $S$ samples of all the parameters from the posterior distribution defined by the model in equation \ref{eq:data_model}. Every element of the vector \unobservableValue is treated as a parameter.\;
\For{i in $1, \ldots, S$}{
	\For{j in $1, \ldots, \datasize$}{
		Draw a new outcome $\tilde{\outcome}_{j}$ from $\text{Bernoulli}(\invlogit(\alpha_y[i] + \beta_{xy}[i] x_j + \beta_{zy}[i] z[i, j]))$\;
	}
	Impute the missing outcomes using the values drawn in the previous step.\;
	Sort the observations in ascending order based on the predictions of the predictive model.\;
	Estimate the failure rate as $\frac{1}{\datasize}\sum_{k=1}^{\lfloor r \cdot \datasize\rfloor} \indicator{\outcomeValue_k=0}$ and add it to the set $\mathcal{U}$.\;
}
\Return{Mean of $\mathcal{U}$.}
	
\caption{Counterfactual-based imputation}
\end{algorithm}
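For reference, a compact NumPy sketch of the algorithm above is given below. It is our own illustration: the variable names, the array layout of the posterior draws and the use of the predictive model's risk scores for sorting are assumptions, not the paper's implementation.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

def failure_rate(y_obs, t_obs, risk_scores, r, x, z, alpha_y, beta_xy, beta_zy):
    # Mean failure rate at acceptance rate r, averaged over S posterior draws.
    #   y_obs       : (n,) recorded outcomes (only meaningful where t_obs == 1)
    #   t_obs       : (n,) decisions in the data (0 = negative decision, outcome missing)
    #   risk_scores : (n,) predictions of the predictive model used for ranking
    #   x : (n,) observed features;  z : (S, n) posterior draws of the latent variable
    #   alpha_y, beta_xy, beta_zy : (S,) posterior draws of the outcome model
    S, n = z.shape
    order = np.argsort(risk_scores)               # ascending: release lowest predicted risk first
    accepted = order[:int(r * n)]
    rates = np.empty(S)
    for i in range(S):
        p = 1.0 / (1.0 + np.exp(-(alpha_y[i] + beta_xy[i] * x + beta_zy[i] * z[i])))
        y_tilde = rng.binomial(1, p)              # counterfactual outcomes under T = 1
        y = np.where(t_obs == 1, y_obs, y_tilde)  # impute only the missing outcomes
        rates[i] = np.sum(y[accepted] == 0) / n
    return rates.mean()
\end{verbatim}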