notes.tex

\documentclass[11pt,a4paper]{amsart}
\usepackage[margin=1in]{geometry}                % See geometry.pdf to learn the layout options. There are lots.
%\geometry{a4paper}                   % ... or letterpaper or a5paper or ... 
%\geometry{landscape}                % Activate for for rotated page geometry 
%\usepackage[parfill]{parskip}    % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{epstopdf}
\usepackage[hidelinks, colorlinks=true]{hyperref}
%\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}

\usepackage[normalem]{ulem} %tables
\useunder{\uline}{\ul}{}
\usepackage{multirow}

\usepackage{enumitem}

%\usepackage[toc,page]{appendix}
%
%\renewcommand\appendixtocname{Appendices -- Mostly old}
%\renewcommand\appendixpagename{Appendices -- Mostly old}

\usepackage{wrapfig} % wrap figures

\usepackage{booktabs}% http://ctan.org/pkg/booktabs
\newcommand{\tabitem}{~~\llap{\textbullet}~~}

\usepackage{pgf}
\usepackage{tikz}
\usepackage{tikz-cd}
\usetikzlibrary{arrows,automata, positioning}

\usepackage{algorithm}% http://ctan.org/pkg/algorithms
\usepackage{algorithmic}% http://ctan.org/pkg/algorithms
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Procedure:}}
\renewcommand{\algorithmicreturn}{\textbf{Return}}

\newcommand{\pr}{\mathbb{P}} % tn merkki
\newcommand{\D}{\mathcal{D}} % aineisto
\newcommand{\s}{\mathcal{S}} % "fancy S"
\newcommand{\M}{\mathcal{M}} % "fancy M"
\newcommand{\B}{\mathcal{B}} % "fancy B"
\newcommand{\RR}{\mathcal{R}} % supistusalgon R
\newcommand{\invlogit}{\text{logit}^{-1}} % supistusalgon R

\renewcommand{\descriptionlabel}[1]{\hspace{\labelsep}\textnormal{#1}}

\makeatletter
%Table of Contents
\setcounter{tocdepth}{3}

% Add bold to \section titles in ToC and remove . after numbers
\renewcommand{\tocsection}[3]{%
  \indentlabel{\@ifnotempty{#2}{\bfseries\ignorespaces#1 #2\quad}}\bfseries#3}
% Remove . after numbers in \subsection
\renewcommand{\tocsubsection}[3]{%
  \indentlabel{\@ifnotempty{#2}{\ignorespaces#1 #2\quad}}#3}
%\let\tocsubsubsection\tocsubsection% Update for \subsubsection
%...

\newcommand\@dotsep{4.5}
\def\@tocline#1#2#3#4#5#6#7{\relax
  \ifnum #1>\c@tocdepth % then omit
  \else
    \par \addpenalty\@secpenalty\addvspace{#2}%
    \begingroup \hyphenpenalty\@M
    \@ifempty{#4}{%
      \@tempdima\csname r@tocindent\number#1\endcsname\relax
    }{%
      \@tempdima#4\relax
    }%
    \parindent\z@ \leftskip#3\relax \advance\leftskip\@tempdima\relax
    \rightskip\@pnumwidth plus1em \parfillskip-\@pnumwidth
    #5\leavevmode\hskip-\@tempdima{#6}\nobreak
    \leaders\hbox{$\m@th\mkern \@dotsep mu\hbox{.}\mkern \@dotsep mu$}\hfill
    \nobreak
    \hbox to\@pnumwidth{\@tocpagenum{\ifnum#1=1\bfseries\fi#7}}\par% <-- \bfseries for \section page
    \nobreak
    \endgroup
  \fi}
\AtBeginDocument{%
\expandafter\renewcommand\csname r@tocindent0\endcsname{0pt}
}
\def\l@subsection{\@tocline{2}{0pt}{2.5pc}{5pc}{}}
\makeatother

%\makeatletter
%\def\listofalgorithms{\@starttoc{loa}\listalgorithmname}
%\def\l@algorithm{\@tocline{0}{3pt plus2pt}{0pt}{1.9em}{}}
%\renewcommand{\ALG@name}{AlGoRiThM}
%\renewcommand{\listalgorithmname}{List of \ALG@name s}
%\makeatother

\usepackage{subcaption}
\graphicspath{ {../figures/} }

\title[Potential outcomes in model evaluation]{Notes, working title: \\ From would-have-beens to should-have-beens: Potential outcomes in model evaluation}

\author{RL}
%\date{}                                           % Activate to display a given date or no date

\begin{document}

\maketitle

\tableofcontents

\begin{abstract}
%This document presents the implementations of RL in pseudocode level. First, I present most of the nomenclature used in these notes. Then I proceed to give my personal views and comments on the motivation behind Selective labels paper. In sections \ref{sec:framework} and \ref{sec:modular_framework}, I present two frameworks for the selective labels problems. The latter of these has subsequently been established the current framework, but former is included for documentation purposes. In the following sections, I present the data generating algorithms and algorithms for obtaining failure rates using different methods in the old framework. Results are presented in section \ref{sec:results}. All the algorithms used in the modular framework are presented with motivations in section \ref{sec:modules}. 

For notes of the meetings please refer to the \href{https://helsinkifi-my.sharepoint.com/personal/rikulain_ad_helsinki_fi/Documents/Meeting_Notes.docx?web=1}{{\ul word document}}. Old discussions have been moved to the appendices.

\end{abstract}

\section*{Terms and abbreviations}

\begin{description}
\item[R :] acceptance rate, leniency of decision maker, $r \in [0, 1]$
\item[X :] personal features, observable to a predictive model
\item[Z :] some features of a subject, unobservable to a predictive model, latent variable
\item[W :] noise added to result variable Y
\item[T :] decision variable, bail/positive decision equal to 1, jail/negative decision equal to 0
\item[Y :] result variable, no crime/positive result equal to 1, crime/negative result equal to 0
\item[MAE :] Mean absolute error
\item[SL :] Selective labels, for more information see Lakkaraju's paper  \cite{lakkaraju17}
\item[Labeled data :] data that has been censored, i.e. if negative decision is given (T=0), then Y is set to NA.
\item[Full data :] data that has all labels available, i.e. \emph{even if} negative decision is given (T=0), Y will still be available.
\item[Unobservables :] unmeasured confounders, latent variables, Z
\end{description}

Mnemonic rule for the binary coding: zero bad (crime or jail), one good!

\section{Introduction}

\begin{wrapfigure}{r}{0.3\textwidth} %this figure will be at the right
    \centering
    \begin{tikzpicture}[->,>=stealth',node distance=1.5cm, semithick]

  \tikzstyle{every state}=[fill=none,draw=black,text=black]

  \node[state] (R)                    {$R$};
  \node[state] (X) [right of=R] {$X$};
  \node[state] (T) [below of=X] {$T$};
  \node[state] (Z) [rectangle, right of=X] {$Z$};
  \node[state] (Y) [below of=Z] {$Y$};

  \path (R) edge (T)
        (X) edge (T)
	     edge (Y)
        (Z) edge (T)
	     edge (Y)
        (T) edge (Y);
\end{tikzpicture}
\caption{Initial model.}
\label{fig:initial_model}
\end{wrapfigure}

Complex computational algorithms have already been deployed in numerous fields such as healthcare and justice to make influential decisions. These machine decisions have a great impact on human lives and therefore they should be audited to show that they improve on human decision-making. In the conventional settings, outcome labels are readily available and performance evaluation is trivial and hence numerous metrics have been proposed in the literature (ROC, AUC etc.). In settings as judicial bail decisions, some outcomes can not be observed due to the selective labeling of the data. This results in a complicated missing data problem where the missingness of an item is connected with its outcome and where the available labels are a non-random sample of the underlying population. Approaches addressing this problem can be separated into two major classes: imputation approaches and other approaches.

%In this paper we propose a novel framework for presenting these missing data problems and present an approach for inferring the missing labels to evaluate the performance of predictive models. We use a flexible Bayesian approach and etc. Additionally we show that our method is robust against violations and modifications in the data generating mechanisms.

%\begin{itemize}
%\item Evaluating predictive models is important. Models are even now deployed in the fields such as X, Y and Z to make influential decisions.
%\item Missing data problems ? Multiple kinds, MAR, MNAR, MCAR, what not. In the current setting data missingness is correlated with the outcome. Labeling has been performed by expert decision-makers wanting to avoid negative outcomes. This causes non-random missingness which has been recently been called selective labels/-ing.
%\item Approaches to this problem can roughly be divided into three categories: ignore, impute and other methods. Imputation methods X, Y, Z. Other is contraction presented by Lakkaraju et al which utilises...
%\item Several nuances to this setting a more relaxed setting in Jung et al preprint where counterfactuals were used to construct optimal policies. There wasn't selective labels.
%\item in addition selective labels.
%\item Interest in presenting a method for robust evaluation of predictive algorithms using potential outcomes.
%\end{itemize}

\section{Framework}

$$DG+DEC = DATA \rightarrow MODEL \rightarrow EVALUATION$$

In our framework, the flow of information is divided into three steps: data generation, modelling and evaluation steps. Our framework definition relies on precise definitions on the properties of each step. A step is fully defined when its input and output are unambiguously described.

The data generation step generates all the data, including the features and the selectively labeled outcomes of the subjects and the features of the deciders. In our setting we discuss mainly five different variables: subjects have feature X for observed features, Z for unobserved features (or \emph{unobservables} in short), and Y for the outcome. The deciders (judges, doctors) assigning the labels are defined by their leniency R and the decisions given to subjects T. The central variants for feature and decision generation are presented in section \ref{sec:modules}. (we have separated the decision process from the feature generation) The effects of variables on each other is presented as a directed graph in figure \ref{fig:initial_model}.

After the feature generation, the data is moved to a predictive model in the modelling step. The model, e.g. a regression model, is trained using one portion of the original data. Using the trained model we assign predictions for negative outcomes for the observations in the other part of the data. It is this model the performance of which we are interested in.

Finally, an evaluation algorithm tries to output a reliable estimate of the model's failure rate. A good evaluation algorithm gives a precise, unbiased and low variance estimate of the failure rate of the model with a given leniency $r$. As explained the setting is characterized by the \emph{selctive labeling} of the data.


\section{Potential outcomes in model evaluation}

Potential outcomes evaluator uses Stan to infer the latent variable and the path coefficients and to estimate expectation of Y for the missing outcomes. Full hierarchical model is presented in eq. \ref{eq:po} below. Expectations for the missing outcomes Y are obtained as \sout{point estimates from the maximum of the joint posterior} means of the posterior predictive distribution. (Joint posterior appeared to be bimodal or skewed and therefore the mean gave better estimates.)

Priors for the $\beta$ coefficients were chosen to be sufficiently non-informative without restricting the density estimation. Coefficients for the latent Z were restricted to be positive for posterior estimation. The  resulting distribution is the half-normal distribution ($X\sim N(0, 1) \Rightarrow Y=|X| \sim$ Half-Normal). The $\alpha$ intercepts are only for the decision variable to emulate the differences in leniency for each $M$ judge. Then subjects with equal x and z will have a different probability for bail given the judges leniency. 

\begin{align} \label{eq:po}
 Y~|~t,~x,~z,~\beta_{xy},~\beta_{zy} & \sim \text{Bernoulli}(\invlogit(\beta_{xy}x + \beta_{zy}z)) \\ \nonumber
 T~|~x,~z,~\beta_{xt},~\beta_{zt} & \sim \text{Bernoulli}(\invlogit(\alpha_j + \beta_{xt}x + \beta_{zt}z)) \\ \nonumber
 Z &\sim N(0, 1) \\ \nonumber
  \beta_{xt}, \beta_{xy} & \sim N(0, 10^2) \\ \nonumber
  \beta_{zt}, \beta_{zy} & \sim N_+(0, 10^2) \\ \nonumber
  p(\alpha_j) & \propto 1 \hskip1.0em \text{for } j \in \{1, 2, \ldots, M\}
\end{align}

Model is fitted on the test set and the expectations of potential outcomes $Y_{T=1}$ are used in place of missing outcomes.

\section{Results}

Here we present the results obtained from four core settings and four settings created for robustness check. Our core settings include situations where data is generated with the unobservables Z. Settings are characterized by the outcome creating mechanism and decision assignment.

The four main result figures are presented in figure set \ref{fig:results_bayes} with captions indicating the outcome and decision mechanisms. Figures showing the variance of the failure rate estimates are in appendix section \ref{sec:diagnostic}.

\begin{figure}[]
    \centering
    \begin{subfigure}[b]{0.475\textwidth}
        \includegraphics[width=\textwidth]{sl_result_bernoulli_independent}
        \caption{Outcome Y from Bernoulli, independent decisions using the quantiles.}
        %\label{fig:}
    \end{subfigure}
    \quad %add desired spacing between images, e. g. ~, \quad, \qquad, \hfill etc. 
      %(or a blank line to force the subfigure onto a new line)
    \begin{subfigure}[b]{0.475\textwidth}
        \includegraphics[width=\textwidth]{sl_result_threshold_independent}
        \caption{Outcome Y from threshold rule, independent decisions using the quantiles.}
        %\label{fig:}
    \end{subfigure}
    \begin{subfigure}[b]{0.475\textwidth}
        \includegraphics[width=\textwidth]{sl_result_bernoulli_batch}
        \caption{Outcome Y from Bernoulli, non-independent decisions.}
        %\label{fig:}
    \end{subfigure}
    \quad %add desired spacing between images, e. g. ~, \quad, \qquad, \hfill etc. 
      %(or a blank line to force the subfigure onto a new line)
    \begin{subfigure}[b]{0.475\textwidth}
        \includegraphics[width=\textwidth]{sl_result_threshold_batch}
        \caption{Outcome Y from threshold rule, non-independent decisions.}
        %\label{fig:}
    \end{subfigure}
    \caption{Core results: Failure rate vs. acceptance rate with varying levels of leniency. Different combinations of deciders and data generation mechanisms.}
    \label{fig:results_bayes}
\end{figure}

The robustness checks are presented in figure set \ref{fig:results_robustness} depicting deciders who
\begin{enumerate}[label=\Alph*)]
\item assign bail randomly with probability $r$, 
\item who favour and dislike some defendants with certain values of X
\item bad judge with a wrong $\beta_x$ 
\end{enumerate}
Last figure (D) is to illustrate the situation where the data generation mechanism is exactly the same as in model specified in eq. \ref{eq:po} with $\beta$ coefficients equal to 1 and $\alpha$ coefficients equal to 0.

The figures show that the contraction method is fairly robust to changes in data and decision generation although it has some variance. The potential outcomes approach consistently underestimates the true failure rate resulting in mediocre performance (compared to the imputation algorithms presented in the SL paper). The mean absolute errors of contraction with regard to true evaluation were in the order of 0.009...0.005 compared to our MAEs of approximately 0.015...0.035. Worrying aspect of the analysis is that even if the data generating process follows exactly the specified hierarchical model, the failure rate is still approximately 0.015.

Notable is that the diagnostic figures in section \ref{sec:diagnostic} show that few failure rate estimates are too high showing as "flyers/outliers" implying problems in model identifiability, probably as in bimodal posterior. (In the output when Stan tries to find the minimum of the negative log posterior (max of posterior) it occasionally converges to log probability of approximately -200...-400 when most of the times the minimum is at -7000...-9000.)

% The figures show that the potential outcomes approach estimates the true failure rate with a slightly better performance than the contraction algorithm (MAEs 0.00X...0.00Y compared to 0.00T...0.00U respectively). The results are in for only some of the core situations discussed, but will be provided for the Thursday meeting. The diagnostic figures in section \ref{sec:diagnostic} also show that the predictions given by the potential outcomes approach have a lower variance.

Imposing constraints to model in equation \ref{eq:po} did not yield significantly better results. The model was constrained so that there was only one intercept $\alpha$ and so that $\beta_{xt}=\beta_{xy}$ and $\beta_{zt}=\beta_{zy}$. Other caveats of the current approach is its scalability to large data sets.

\begin{figure}[H]
    \centering
    \begin{subfigure}[b]{0.475\textwidth}
        \includegraphics[width=\textwidth]{sl_result_random}
        \caption{Random decisions.}
        %\label{fig:}
    \end{subfigure}
    \quad %add desired spacing between images, e. g. ~, \quad, \qquad, \hfill etc. 
      %(or a blank line to force the subfigure onto a new line)
    \begin{subfigure}[b]{0.475\textwidth}
        \includegraphics[width=\textwidth]{sl_result_biased}
        \caption{Biased decisions.}
        \label{fig:bias}
    \end{subfigure}
    \begin{subfigure}[b]{0.475\textwidth}
        \includegraphics[width=\textwidth]{sl_result_bad}
        \caption{Bad judge with $\beta_x=0.2$.}
        %\label{fig:}
    \end{subfigure}
    \quad %add desired spacing between images, e. g. ~, \quad, \qquad, \hfill etc. 
      %(or a blank line to force the subfigure onto a new line)
    \begin{subfigure}[b]{0.475\textwidth}
        \includegraphics[width=\textwidth]{sl_result_bernoulli_bernoulli}
        \caption{Data generation as corresponding to model.}
        %\label{fig:}
    \end{subfigure}
    \caption{Robustness check figures: Failure rate vs. acceptance rate with varying levels of leniency. Different combinations of deciders and data generation mechanisms. Only one data set used, which affects the performance of contraction.}
    \label{fig:results_robustness}
\end{figure}

\begin{thebibliography}{9} % Might have been apa

\bibitem{lakkaraju17} 
   Lakkaraju, Himabindu. The Selective Labels Problem: Evaluating Algorithmic Predictions in the Presence of Unobservables. 2017. 

\end{thebibliography}

%\end{document}

\newpage

%\begin{appendices}
\appendix

\section{RL's comments on the selective labels paper} \label{sec:comments}

\emph{This chapter is to present my comments and insight regarding the topic.}

The motivating idea behind the SL paper of Lakkaraju et al. \cite{lakkaraju17} is to evaluate if machines could improve on human performance. In general case, comparing the performance of human and machine evaluations is simple. In the domains addressed by Lakkaraju et al. simple comparisons would be unethical and therefore algorithms are required.

The general idea of the SL paper is to train some predictive model with selectively labeled data. The question is then "how would this predictive model perform if it was to make independent bail-or-jail decisions?" That quantity can not be calculated from real-life data sets due to the ethical reasons and hidden labels. We can however use more selectively labeled data to estimate it's performance. But, because the available data is biased, the performance estimates are too good or "overly optimistic" if they are calculated in the conventional way ("labeled outcomes only"). This is why they are proposing the contraction algorithm.

One of the concepts to denote when reading the Lakkaraju paper is the difference between the global goal of prediction and the goal in this specific setting. The global goal is to have a low failure rate with high acceptance rate, but at the moment we are not interested in it. The goal in this setting is to estimate the true failure rate of the model with unseen biased data. That is, given only selectively labeled data and an arbitrary black-box model $\B$ we are interested in predicting the performance of model $\B$ in the whole data set with all ground truth labels.

On acceptance rate R: We discussed how Lakkaraju's paper treats variable R in a seemingly non-sensical way, it is as if a judge would have to let someone go today in order to detain some other defendant tomorrow to keep their acceptance rate at some $r$. A more intuitive way of thinking $r$ would be the "threshold perspective". That is, if a judge sees that a defendant has probability $p_x$ of committing a crime if let out, the judge would detain the defendant if $p_x > r$, the defendant would be too dangerous to let out. Now, it can be claimed that judges do have a threshold, even in Lakkaraju's setting. Let $c$ be a judge's acceptance threshold. Now decision $T$ can be defined with function $f_T(x, z)$ as follows:

\begin{equation}
  \pr(T=0|x, z)=\begin{cases}
    p_T, & \text{if $f_T(x, z) < c$}\\
    1-p_T, & \text{otherwise}.
  \end{cases}
\end{equation}

If $c$ is defined so that the ratio of positive decisions to all decisions will be equal to $r$, we will arrive at a similar data generation process as Lakkaraju  and as is presented in algorithm \ref{alg:data_with_Z}.

Finally, chapters from Lakkaraju \cite{lakkaraju17} about counterfactual inference, see references from their paper [sic]:

\begin{quote}
There has also been some work on inferring labels using counterfactual inference techniques [9, 12, 16, 36, 38] and leveraging these estimates when computing standard evaluation metrics. However, counterfactual inference techniques explicitly assume that there are no unmeasured confounders (that is, no unobservable variables Z) that could affect the outcome Y. This assumption does not typically hold in cases where human decisions are providing data labels [7, 21]. Thus, the combination of two ingredients -- selective labels and non-trivial unobservables -- poses problems for these existing techniques.

Counterfactual inference. Counterfactual inference techniques have been used extensively to estimate treatment effects in observational studies. These techniques have found applications in a variety of fields such as machine learning, epidemiology, and sociology [3, 8--10, 30, 34]. Along the lines of Johansson et al. [16], counterfactual inference techniques can be broadly categorized as: (1) parametric methods which model the relationship between observed features, treatments, and outcomes. Examples include any type of regression model such as linear and logistic regression, random forests and regression trees [12, 33, 42]. (2) non-parametric methods such as propensity score matching, nearest-neighbor matching, which do not explicitly model the relationship between observed features, treatments, and outcomes [4, 15, 35, 36, 41]. (3) doubly robust methods which combine the two aforementioned classes of techniques typically via a propensity score weighted regression [5, 10]. The effectiveness of parametric and non-parametric methods depends on the postulated regression model and the postulated propensity score model respectively. If the postulated models are not identical to the true models, then these techniques result in biased estimates of outcomes. Doubly robust methods require only one of the postulated models to be identical to the true model in order to generate unbiased estimates. However, due to the presence of unobservables, we cannot guarantee that either of the postulated models will be identical to the true models.
\end{quote}

\section{Framework definition -- 13 June discussion} \label{sec:framework}

\emph{In this section we define some key terms and concepts and derive a more unambiguous framework for the selective labels problem. The framework is presented in writing and as a picture in figures \ref{fig:framework} and \ref{fig:framework_data_flow}.}

First, data is generated through a \textbf{data generating process (DGP)}. DGP comprises of generating the private features for the subjects, generating the acceptance rates for the judges and assigning the subjects to the judges. \textbf{Acceptance rate (AR)} is defined as the ratio of positive decisions to all decisions that a judge will give. As a formula \[ AR = \dfrac{\#\{Positive~decisions\}}{\#\{Decisions\}}. \]   Data generating process is depicted in the first box of Figure \ref{fig:framework}.

Next, the all of the generated data goes to the \textbf{labeling process}. In the labeling process, it is determined which instances of the data will have an outcome label available. This is done by humans and is presented in lines 5--7 of algorithm \ref{alg:data_without_Z} and 5--8 of algorithm \ref{alg:data_with_Z}. The data is then split randomly into training and test datasets, $\D_{train}$ and $\D_{test}$ respectively.

After labeling, the labeled training data is used to train a machine that will either make decisions or predictions using some features of the data. Then, the test data will be given to the machine and it will output either binary decisions (yes/no), probabilities (a real number in interval $[0, 1]$) or a metric for ordering for all the instances in the test data set. The machine will be denoted with $\M$.

Finally the decisions and/or predictions made by the machine $\M$ and human judges (see dashed arrow in figure \ref{fig:framework}) will be evaluated using an \textbf{evaluation algorithm}. Evaluation algorithms will take the decisions, probabilities or ordering generated in the previous steps as input and then output an estimate of the machine's failure rate in the test data. \textbf{Failure rate (FR)} is defined as the ratio of undesired outcomes to given decisions. One special characteristic of FR in this setting is that a failure can only occur with a positive decision. More explicitly \[ FR = \dfrac{\#\{Failures\}}{\#\{Decisions\}}. \] Second characteristic of FR is that the number of positive decisions and therefore FR itself can be controlled through acceptance rate defined above.

Given the above framework, the goal is to create an evaluation algorithm that can accurately estimate the failure rate of any model $\M$ if it were to replace human decision makers in the labeling process. The estimations have to be made using only data that human decision-makers have labeled. The failure rate has to be accurately estimated for various levels of acceptance rate. The accuracy of the estimates can be compared by computing e.g. mean absolute error w.r.t the estimates given by \nameref{alg:true_eval} algorithm.

\begin{figure} [H]
\centering
\begin{tikzpicture}[->, >=stealth, shorten >=1pt, auto, node distance=1.5cm, semithick]

  \tikzstyle{every state}=[fill=none, draw=black, text=black, rectangle, minimum width=6.0cm]

  \node[state] (DG) {Data generation};
  \node[state] (LP) [below of=DG] {Labeling process (human)};
  \node[state] (MP) [below of=LP] {Machine predictions};
  \node[state] (EA)  [below of=MP] {Evaluation algorithm};

  \path (DG) edge (LP)
                    edge [bend right=90, dashed] node [left] {(1)} (MP)
        (LP) edge (MP)
             edge [bend left=90, dashed] node {(2)} (EA)
        (MP) edge (EA);
\end{tikzpicture}
\caption{The selective labels framework. Dashed arrow (1) marks the flow of unlabeled data to \nameref{alg:true_eval} algorithm and (2) indicates how human evaluations are evaluated without machine intervention using \nameref{alg:human_eval} algorithm.}
\label{fig:framework}
\end{figure}

\begin{figure} [H]
\centering
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,node distance=1.5cm,
                            semithick]
        
          \tikzstyle{every state}=[fill=none,draw=black,text=black, rectangle, minimum width=6.0cm]
        
          \node[state] (DG) {Data generation};
          \node[state] (LP) [below of = DG] {Labeling process (human)};
          \node[state] (MT) [below left=1.0cm and -4cm of LP] {Model training};
          \node[state] (MP) [below=1.0cm of MT] {Machine predictions};
          \node[state] (EA)  [below right=0.75cm and -4cm of MP] {Evaluation algorithm};
        
          \path (DG) edge (LP)
                            edge [out=180, in=180, dashed] node [left]  {$\D_{test,~unlabeled}$ (1)}   (MP)
                (LP) edge [bend right=19] node [left] {$\D_{train}$} (MT)
                      edge [bend left=60] node [right]  {$\D_{test}$} (MP)
                     edge [bend left=75, dashed] node [right]  {$\D_{test}$ (2)}  (EA)
                (MT) edge node {$\M$} (MP)
                (MP) edge (EA);
        \end{tikzpicture}