\documentclass[sigconf,anonymous]{acmart}
% \documentclass[sigconf]{acmart}
\usepackage{tikz}
\usepackage{tikz-cd}
\usetikzlibrary{arrows,automata, positioning}
% Packages
\usepackage{type1cm} % type1 computer modern font
\usepackage{graphicx} % advanced figures
\usepackage{xspace} % fix space in macros
\usepackage{balance} % to better equalize the last page
\usepackage{multirow} % multi rows for tables
\usepackage[font={bf}, tableposition=top]{caption} % captions on top for tables
\usepackage{bold-extra} % bold + {small capital, italic}
\usepackage{siunitx} % \num for decimal grouping
\usepackage[vlined,linesnumbered,ruled,noend]{algorithm2e} % algorithms
\usepackage{booktabs} % nicer tables
%\usepackage[hyphens]{url} % handle long urls
%\usepackage[bookmarks, pdftex, colorlinks=false]{hyperref} % clickable references
%\usepackage[square,numbers]{natbib} % better references
\usepackage{microtype} % compress text
\usepackage{units} % nicer slanted fractions
\usepackage{mathtools} % amsmath++
%\usepackage{amssymb} % math symbols
%\usepackage{amsmath}
\usepackage{relsize}
\captionsetup{belowskip=6pt,aboveskip=2pt} % to save space.
%\usepackage{subcaption}
% \usepackage{multicolumn}
\usepackage[utf8]{inputenc}
\usepackage{xfrac}
\RequirePackage{graphicx,color}
\usepackage[font={small}]{subfig} % subfig, 4 figures in a row
\usepackage{pifont}
\usepackage{footnote} % show footnotes in tables
\makesavenoteenv{table}
\newcommand{\acomment}[1]{{{\color{orange} [A: #1]}}}
\newcommand{\rcomment}[1]{{{\color{red} [R: #1]}}}
\newcommand{\mcomment}[1]{{{\color{blue} [M: #1]}}}
% Semantic macros used in the text; definitions reconstructed from their use below.
\newcommand{\invlogit}{\operatorname{logit}^{-1}}
\newcommand{\features}{\ensuremath{X}\xspace}
\newcommand{\featuresValue}{\ensuremath{x}\xspace}
\newcommand{\unobservable}{\ensuremath{Z}\xspace}
\newcommand{\decision}{\ensuremath{T}\xspace}
\newcommand{\D}{\ensuremath{\mathcal{D}}}
\newcommand{\s}{\ensuremath{\mathcal{S}}}
\newcommand{\RR}{\ensuremath{\mathcal{R}}}
\newcommand{\ourtitle}{Evaluating Decision Makers over Selectively Labeled Data}
\title{\ourtitle}
\author{Michael Mathioudakis}
\affiliation{%
\institution{University of Helsinki}
\city{Helsinki}
\country{Finland}
}
\email{michael.mathioudakis@helsinki.fi}
\begin{abstract}
Today, AI systems replace humans in an increasing number of decisions affecting people's lives.
Therefore, it is important to evaluate the performance of such systems {\it offline}, i.e., before they are deployed in real settings --
and compare it to the performance of human decisions they aim to replace.
One major challenge in such cases is that often past decisions have skewed the data on which the evaluation is performed.
For example, when a bank decides whether a customer should be granted a loan, it wants to grant loans to customers who would honor their conditions, but not to those who would violate them.
However, we can directly evaluate only the decisions to grant a loan, as we cannot observe whether customers who were denied a loan would indeed have violated its conditions.
%
Such skew appears in the decisions of both human and AI decision makers -- and should be properly taken into account for evaluation.
In this paper, we develop a Bayesian approach towards this end that uses counterfactual-based imputation to infer unobserved outcomes.
%
Compared to the previous state of the art, our approach estimates the quality of decisions more accurately and with lower variance.
%
The approach is also shown to be robust to variations in the decision mechanisms that generated the data.
\mcomment{On one hand, since we use judicial data in our experiments, it makes sense to use the bail-or-jail case in the abstract. On the other hand, this does not connect with the motivation we provide to evaluate the decision of (computer/ML/AI) systems, since jail-or-bail decisions are not currently made by such systems (risk scores are used as assisting tools). The bank loan example might look better in the abstract.}
\end{abstract}
\begin{document}
\fancyhead{}
\maketitle
\renewcommand{\shortauthors}{Authors}
\section{Related work}
\begin{itemize}
\item Lakkaraju and contraction. \cite{lakkaraju2017selective}
\item Contraction
\begin{itemize}
\item Algorithm by Lakkaraju et al. Assumes that subjects are assigned to the judges at random, and requires that the judges differ in leniency.
\item Can estimate the true failure rate only up to the leniency of the most lenient decision-maker.
\item Performance is affected by the number of subjects judged by the most lenient decision-maker, the agreement rate, and the leniency of the most lenient decision-maker. (Performance is guaranteed / better when ...)
\item Works only for binary outcomes.
\item (We show that our method is not constrained by any of these.)
\item The algorithm is given as Algorithm~\ref{alg:contraction}; a Python sketch follows this list.
\begin{algorithm}[t]
\caption{Contraction algorithm \cite{lakkaraju2017selective}}
\label{alg:contraction}
\KwIn{Labeled test data $\D$ with probabilities $\s$ and \emph{missing outcome labels} for observations with $T=0$; acceptance rate $r$}
\KwOut{Estimated failure rate $\mathbf{u}$}
Let $q$ be the decision-maker with the highest acceptance rate in $\D$\;
$\D_q \gets \{(x, j, t, y) \in \D \mid j=q\}$ \tcp*{observations judged by $q$}
$\RR_q \gets \{(x, j, t, y) \in \D_q \mid t=1\}$ \tcp*{observations in $\D_q$ with observed outcome labels}
Sort the observations in $\RR_q$ in descending order of confidence score $\s$ and assign the result to $\RR_q^{sort}$ \tcp*{observations deemed high-risk by the black-box model $\mathcal{B}$ come first}
Remove the top $[(1.0-r)|\D_q|]-[|\D_q|-|\RR_q|]$ observations of $\RR_q^{sort}$ and call the remaining list $\mathcal{R_B}$ \tcp*{$\mathcal{R_B}$ is the list of observations assigned $t=1$ by $\mathcal{B}$}
Compute $\mathbf{u}=\sum_{i=1}^{|\mathcal{R_B}|} \dfrac{\delta\{y_i=0\}}{|\D_q|}$\;
\Return $\mathbf{u}$
\end{algorithm}
\end{itemize}
\item Counterfactuals/Potential outcomes. \cite{pearl2010introduction} (also Rubin)
\item Approach of Jung et al for optimal policy construction. \cite{jung2018algorithmic}
\item Discussions of latent confounders in multiple contexts.
\item Imputation methods and other approaches to selective labels, eg. \cite{dearteaga2018learning}
\end{itemize}
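For concreteness, the following is a minimal Python sketch of contraction as we read Algorithm~\ref{alg:contraction}; the array names and the use of \texttt{numpy} are our own, not taken from the original paper.
\begin{verbatim}
import numpy as np

def contraction(judge_ids, decisions, outcomes, scores, r):
    # Aligned arrays: judge_ids; decisions t in {0, 1}; outcomes y
    # (meaningful only where t == 1); confidence scores s of the
    # black-box model B.  r is the target acceptance rate.
    judges = np.unique(judge_ids)
    # q: the decision-maker with the highest acceptance rate.
    q = judges[np.argmax([decisions[judge_ids == j].mean()
                          for j in judges])]
    in_Dq = judge_ids == q                # observations judged by q
    in_Rq = in_Dq & (decisions == 1)      # ... with observed outcomes
    # Sort R_q by descending score: highest predicted risk first.
    idx = np.where(in_Rq)[0]
    idx = idx[np.argsort(-scores[idx])]
    # Drop the top (1 - r)|D_q| - (|D_q| - |R_q|) observations; what
    # remains is R_B, the list B would accept at acceptance rate r.
    n_drop = int((1.0 - r) * in_Dq.sum()) - (in_Dq.sum() - in_Rq.sum())
    R_B = idx[max(n_drop, 0):]
    # Failure rate: failures (y == 0) in R_B, normalized by |D_q|.
    return (outcomes[R_B] == 0).sum() / in_Dq.sum()
\end{verbatim}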
\section{Experiments}
In this section, we present results from experiments with synthetic and realistic data. We show that our approach provides the most accurate estimates of the performance of a predictive model at all levels of leniency.
\rcomment{ I presume MM's preferences were that the outcome would be from Bernoulli distribution and that the decisions would be independent. So, let's first explain those ways thoroughly and then mention what we changed as discussed.}
\subsection{Synthetic data}
We experimented with synthetic data sets to examine accuracy, unbiasedness, and robustness to violations of the assumptions.
We sampled $N=7000$ values of $X$, $Z$, and $W$ as independent standard Gaussians. We then drew the outcome $Y$ from a Bernoulli distribution with parameter $p = 1 - \invlogit(\beta_x x+\beta_z z+\beta_w w)$, so that $P(Y=0~|~X, Z, W) = \invlogit(\beta_x x+\beta_z z+\beta_w w)$, where the coefficients of $X$, $Z$ and $W$ were set to $1$, $1$ and $0.2$, respectively. The leniency levels $R$ of the $M=14$ judges were assigned pairwise, so that the pairs had leniencies $0.1,~0.2,\ldots, 0.7$. The subjects were assigned to the judges at random, so that each judge received $500$ subjects. The data was divided in half to form a training set and a test set. This process follows the suggestion of Lakkaraju et al.~\cite{lakkaraju2017selective}. \acomment{Check before?}
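The generation process is summarized by the following sketch; this is our own minimal \texttt{numpy} version, and the seed and variable names are arbitrary.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, M = 7000, 14

def inv_logit(a):
    return 1.0 / (1.0 + np.exp(-a))

# Independent standard Gaussians; Z is unobserved by the evaluated
# model and W is additional noise.
X, Z, W = rng.standard_normal((3, N))
bx, bz, bw = 1.0, 1.0, 0.2

# P(Y = 0 | X, Z, W) = inv_logit(bx*X + bz*Z + bw*W), hence
# Y ~ Bernoulli(1 - inv_logit(...)).
Y = rng.binomial(1, 1.0 - inv_logit(bx * X + bz * Z + bw * W))

# M = 14 judges in pairs with leniencies 0.1, ..., 0.7; subjects
# are assigned at random, 500 per judge.
leniency_of_judge = np.repeat(np.linspace(0.1, 0.7, 7), 2)
judge = rng.permutation(np.repeat(np.arange(M), N // M))
\end{verbatim}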
The \emph{default} decision-maker in the data predicts a subject's probability of recidivism as $P(\decision = 0~|~\features, \unobservable) = \invlogit(\beta_x x+\beta_z z)$. Each decision-maker is assigned a leniency value $r$, and the decision is made by comparing $P(\decision = 0~|~\features, \unobservable)$ to the inverse cumulative distribution function of the predicted risk evaluated at the leniency, $F^{-1}(r)$. If $F^{-1}(r) < P(\decision = 0~|~\features, \unobservable)$, the subject is given a negative decision $\decision = 0$, and a positive decision otherwise. \rcomment{Needs double checking.} This ensures that the decisions are independent and that the fraction of positive decisions converges to $r$. The outcomes of subjects who received a negative decision were then set to $0$.
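The thresholding step can be written as follows; this is a sketch of our reading of the mechanism, and the closed-form quantile uses the fact that $\beta_x X + \beta_z Z$ is Gaussian. It reuses \texttt{inv\_logit} and the variables from the previous sketch.
\begin{verbatim}
from scipy.stats import norm

def default_decisions(X, Z, Y, r, bx=1.0, bz=1.0):
    risk = inv_logit(bx * X + bz * Z)      # P(T = 0 | x, z)
    # bx*X + bz*Z ~ N(0, bx^2 + bz^2), so the r-quantile F^{-1}(r)
    # of the risk scores is inv_logit of the Gaussian r-quantile.
    threshold = inv_logit(norm.ppf(r, scale=np.hypot(bx, bz)))
    T = (risk <= threshold).astype(int)    # T = 0 iff F^{-1}(r) < risk
    Y_obs = np.where(T == 1, Y, 0)         # negative decision: outcome 0
    return T, Y_obs
\end{verbatim}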
We used a number of different decision mechanisms. A \emph{limited} decision-maker works like the default one, but predicts the risk of a negative outcome using only the recorded features \features, so that $P(\decision = 0~|~\features) = \invlogit(\beta_x x)$; hence, it is unable to observe $Z$. A \emph{biased} decision-maker also works like the default one, but the values of the features \features it observes are distorted. We modified the values so that if \featuresValue was greater than $1$, it was multiplied by $0.75$ to induce more positive decisions; similarly, if \featuresValue was in the interval $(-2,~-1)$, we added $0.5$ to it to induce more negative decisions. Additionally, we investigated the effect of non-informative decisions by deploying a \emph{random} decision-maker: given leniency $R$, a random decision-maker gives a positive decision $T=1$ with probability $R$.
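The alternative decision-makers are small variations of the same sketch; again, this is our own code rather than a reference implementation, and it reuses \texttt{inv\_logit} and \texttt{norm} from above.
\begin{verbatim}
def limited_decisions(X, Y, r, bx=1.0):
    # Limited: cannot observe Z, so risk is based on X alone.
    risk = inv_logit(bx * X)
    threshold = inv_logit(norm.ppf(r, scale=bx))
    T = (risk <= threshold).astype(int)
    return T, np.where(T == 1, Y, 0)

def biased_view(X):
    # Biased: the default decision-maker sees distorted features.
    Xb = X.copy()
    Xb[X > 1] *= 0.75               # lower risk: more positive decisions
    Xb[(X > -2) & (X < -1)] += 0.5  # higher risk: more negative decisions
    return Xb

def random_decisions(Y, r, rng):
    # Random: T = 1 with probability r, independently of the features.
    T = rng.binomial(1, r, size=len(Y))
    return T, np.where(T == 1, Y, 0)
\end{verbatim}
A biased decision-maker is then obtained as \texttt{default\_decisions(biased\_view(X), Z, Y, r)}.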
In contrast, Lakkaraju et al. essentially order the subjects and assign $T=1$ to the fraction of them given by the leniency $R$. We see this as unrealistic: the decision on one subject should not depend on the decisions on other subjects. In the bail-or-jail example, this would induce unethical behaviour: a judge would need to jail one defendant today in order to release another defendant tomorrow.
We instead treat the decisions as independent; leniency is still a good estimate of the acceptance rate, because the acceptance rate converges to the leniency as the number of subjects grows.
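For comparison, the batch mechanism of Lakkaraju et al., as described above, can be sketched as follows (our paraphrase):
\begin{verbatim}
def batch_decisions(risk, Y, r):
    # Release exactly the fraction r of the caseload with the lowest
    # predicted risk; decisions are coupled across subjects.
    T = np.zeros(len(risk), dtype=int)
    T[np.argsort(risk)[: int(r * len(risk))]] = 1
    return T, np.where(T == 1, Y, 0)
\end{verbatim}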
We deployed multiple evaluators to estimate the true failure rate of the decision-maker. The estimates should be close to those of the true evaluation, and they will eventually also be compared to the human evaluation curve. Sketches of the first two evaluators follow the list below.
\begin{itemize}
\item \emph{True evaluation:} True evaluation depicts the true performance of a model. The estimate is computed by first sorting the subjects in descending order of the model's predicted risk; the model then releases the $r$ fraction of subjects with the lowest risk, and the true failure rate is computed directly from their outcome labels. True evaluation can only be computed on synthetic data sets, as it requires ground-truth labels that are missing from real data.
%\item \emph{Human evaluation:} Human evaluation presents the performance of the decision-makers who observe the latent variable. The human evaluation curve is computed by grouping decision-makers with similar leniency into bins and computing their failure rate from the ground-truth labels. \rcomment{Not computing now.}
\item \emph{Labeled outcomes:} The labeled outcomes algorithm is the conventional method of computing the failure rate. We proceed as in true evaluation, but use only the available outcome labels.
\item \emph{Contraction:} Contraction is an algorithm designed specifically to estimate the failure rate of a black-box predictive model under selective labeling; see the previous section and Algorithm~\ref{alg:contraction}.
\end{itemize}
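The first two evaluators admit short sketches; as elsewhere, the code is our own, and failure is coded as $y=0$, consistently with the contraction sketch.
\begin{verbatim}
import numpy as np

def true_evaluation(risk, y_true, r):
    # Failure rate when the model releases the r fraction with the
    # lowest predicted risk; needs ground truth (synthetic data only).
    released = np.argsort(risk)[: int(r * len(risk))]
    return (y_true[released] == 0).sum() / len(risk)

def labeled_outcomes(risk, y, t, r):
    # Same estimate, but counting failures only among released
    # subjects whose outcome label is actually observed (t == 1).
    released = np.argsort(risk)[: int(r * len(risk))]
    observed = released[t[released] == 1]
    return (y[observed] == 0).sum() / len(risk)
\end{verbatim}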
\paragraph{Results} We deployed the evaluators on the synthetic data set described above; the results are shown in Figures~\ref{fig:results_main} and~\ref{fig:results_main_2}. The proposed method recovers the true performance of a model at all levels of leniency. The figures show that the contraction algorithm can estimate the true performance only up to the leniency of the most lenient decision-maker, whereas the proposed method can do so for arbitrary levels of leniency. To obtain comparable results, we also applied the batch decision-making mechanism of Lakkaraju et al.\ to a synthetic data set with $N=9000$ instances and $M=14$ judges with leniencies $0.1, \ldots, 0.9$. The mean absolute error of contraction ($0.00265$) was approximately $90\%$ higher than that of the proposed method ($0.00139$).
Similar results were obtained from experiments with the aforementioned \emph{limited} \rcomment{Not done yet.}, \emph{biased} and \emph{random} decision-makers. The mean absolute errors of the estimates / other results are presented in .... This shows that the counterfactual-based imputation method is robust to changes in the data-generating mechanism and therefore applies to multiple scenarios. The results from the experiments with biased decision-makers show that the proposed method can perform well despite the biased decisions of the current decision-makers. This is important because...
\begin{figure}
%\centering
\includegraphics[width=\linewidth]{./img/sl_results_independent_decisions}
\caption{Failure rate vs.\ acceptance rate with independent decisions: comparison of the methods; error bars denote the standard deviation of the estimate. The proposed method (red) recovers the true failure rate more accurately than the contraction algorithm (blue). In addition, it accurately tracks the \emph{true evaluation} curve (green) at all levels of leniency, regardless of the leniency of the most lenient decision-maker.}
\label{fig:results_main}
\end{figure}
\begin{figure}
%\centering
\includegraphics[width=\linewidth]{./img/sl_results_batch_decisions}
\caption{Failure rate vs.\ acceptance rate with batch decisions: comparison of the methods; error bars denote the standard deviation of the estimate. The proposed method (red) recovers the true failure rate more accurately than the contraction algorithm (blue). In addition, it accurately tracks the \emph{true evaluation} curve at all levels of leniency, regardless of the leniency of the most lenient decision-maker.}
\label{fig:results_main_2}
\end{figure}
\begin{figure}
%\centering
\includegraphics[width=\linewidth]{./img/sl_results_batch_decisions_error_figure}
\caption{Error w.r.t. True evaluation vs Acceptance rate, error bars denote standard deviations. }
\label{fig:results_main_err}
\end{figure}
\subsection{Realistic data}
In this section, we present results from experiments with realistic data sets.
COMPAS (Correctional Offender Management Profiling for Alternative Sanctions) is a tool by Northpointe (now Equivant) for guiding decisions in the criminal justice system. It provides judges with risk estimates of recidivism and failure to appear. The COMPAS score is derived mainly from ``prior criminal history, criminal associates, drug involvement, and early indicators of juvenile delinquency problems'', and it predicts recidivism over the following two years. Using the COMPAS score as the sole basis for judgement is prohibited by law; judges must base their decisions on other factors as well.
The COMPAS data set contains recidivism data from Broward County, Florida, USA. It was preprocessed by ProPublica for their article ``Machine Bias''. The original data contained information about \num{18610} defendants who were given a COMPAS score during 2013 or 2014. After removing defendants who were not assessed at the pretrial stage, \num{11757} defendants remained. Additionally, defendants whose COMPAS score could not be matched with a corresponding charge were removed from the analysis, resulting in a data set of \num{7214} observations. Following ProPublica's reasoning, after the final data cleaning we were left with \num{6172} offences. The data includes the subjects' demographic information, such as gender, age and race, and information on their previous offences.
For the analysis, we created nine synthetic judges with leniencies $0.1, 0.2, \ldots, 0.9$. The subjects were distributed to the judges evenly and at random. In this semi-synthetic scenario, each judge bases their decisions on the COMPAS score, releasing the fraction of defendants with the lowest scores given by their leniency. Defendants who were given a negative decision had their outcome label hidden. The data was then split into training and test sets, and a logistic regression model was built to predict two-year recidivism from categorized age, gender, the number of priors, and the degree of crime COMPAS screened for (felony or misdemeanour). The same features were used as input for the counterfactual-based imputation method. As the COMPAS score is derived from a richer set of inputs, the synthetic judges can be said to have access to information that is not coded into these four features.
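A sketch of this semi-synthetic construction is given below; the file name and column names are hypothetical placeholders, not those of the actual ProPublica release.
\begin{verbatim}
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
df = pd.read_csv("compas_preprocessed.csv")   # hypothetical file

# Nine synthetic judges with leniencies 0.1, ..., 0.9; subjects are
# distributed evenly and at random.
df["judge"] = rng.permutation(np.arange(len(df)) % 9)
df["leniency"] = 0.1 * (df["judge"] + 1)

# Each judge releases the fraction of their caseload with the lowest
# COMPAS scores given by their leniency; other outcomes are hidden.
df["t"] = 0
for _, group in df.groupby("judge"):
    k = int(group["leniency"].iloc[0] * len(group))
    df.loc[group["compas_score"].nsmallest(k).index, "t"] = 1
df.loc[df["t"] == 0, "two_year_recid"] = np.nan   # selective labels

# Logistic regression on the labeled cases (simplified: no split).
labeled = df[df["t"] == 1]
features = pd.get_dummies(labeled[["age_cat", "gender",
                                   "priors_count", "crime_degree"]])
model = LogisticRegression(max_iter=1000).fit(
    features, labeled["two_year_recid"])
\end{verbatim}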
\paragraph{Results} The results of deploying the algorithms are presented in Figure~\ref{fig:results_compas}. From the figure we see that the counterfactual-based imputation method follows the true evaluation curve closely; the figure also suggests that, if this predictive model were deployed, it would not necessarily improve on the decisions made by these synthetic judges. The mean absolute errors were $0.00493$ for contraction and $0.00409$ for the counterfactual-based imputation method.
\begin{figure}
%\centering
\includegraphics[width=\linewidth]{./img/sl_results_compas_error}
\caption{COMPAS data: Error w.r.t. True evaluation vs Acceptance rate, error bars denote standard deviations. (Preliminary figure) }
\label{fig:results_compas}
\end{figure}
\begin{itemize}
\item Catalonian data (this could be used just for our method: hide ${\sim}25\%$ of the outcome labels and show that we can estimate the failure rate for \emph{all} levels of leniency, even though the leniency of this single judge is only $0.25$) (2nd priority)
\begin{itemize}
\item Size, availability, RisCanvi scoring
\item Subsequent modifications for analysis
\item Results
\end{itemize}
\end{itemize}
\section{Discussion}
\begin{itemize}
\item Conclusions
\item Future work / Impact
\end{itemize}
% \textbf{Acknowledgments.}
%The computational resources must be mentioned.
%\clearpage
% \balance
\bibliographystyle{ACM-Reference-Format}
\bibliography{biblio}
%\balancecolumns % GM June 2007
\end{document}