diff --git a/analysis_and_scripts/notes.tex b/analysis_and_scripts/notes.tex index 449aeb5d1eb3b8bc7d5e8112d3eab86921d369d7..98cdf0667dca51915e617568d0319212eb906f7c 100644 --- a/analysis_and_scripts/notes.tex +++ b/analysis_and_scripts/notes.tex @@ -235,8 +235,11 @@ Given the above framework, the goal is to create an evaluation algorithm that ca \label{fig:framework_data_flow} \end{figure} + \section{Modular framework -- based on 19 June discussion} \label{sec:modular_framework} +\emph{Below is the framework as was written on the whiteboard, then RL presents his own remarks on how he understood this.} + \begin{wrapfigure}{r}{0.25\textwidth} %this figure will be at the right \centering \begin{tikzcd} @@ -248,7 +251,6 @@ Given the above framework, the goal is to create an evaluation algorithm that ca \label{fig:dgm} \end{wrapfigure} -\emph{Below is the framework as was written on the whiteboard, then RL presents his own remarks on how he understood this.} ~ \\ \begin{description} @@ -543,7 +545,7 @@ Causal model, ep & 0.000598624 & 0.0411532\\ \caption{Results with unobservables, $\beta_Z=1$.} \label{fig:results_with_Z} \end{subfigure} - \caption{Failure rate vs. acceptance rate with varying levels of leniency. Logistic regression was trained on labeled training data. $N_{iter}$ was set to 8. \emph{RL: Updated 26 June.}} + \caption{Failure rate vs. acceptance rate with varying levels of leniency. Logistic regression was trained on labeled training data. \emph{RL: Updated 26 June.}} \label{fig:results} \end{figure} @@ -567,7 +569,7 @@ The disparities between figures \ref{fig:results_without_Z} and \ref{fig:betaZ_0 \caption{Results with unobservables, $\beta_Z$ set to 0 in algorithm \ref{alg:data_with_Z}.} \label{fig:betaZ_0} \end{subfigure} - \caption{Effect of $\beta_z$. Failure rate vs. acceptance rate with unobservables in the data (see algorithm \ref{alg:data_with_Z}). Logistic regression was trained on labeled training data. 
Results from algorithm \ref{alg:perf_comp} with $N_{iter}=4$.} + \caption{Effect of $\beta_z$. Failure rate vs. acceptance rate with unobservables in the data (see algorithm \ref{alg:data_with_Z}). Logistic regression was trained on labeled training data. Results from algorithm \ref{alg:perf_comp}.} \label{fig:betaZ_comp} \end{figure} @@ -590,14 +592,14 @@ In this section the predictive model was switched to random forest classifier to \centering \begin{subfigure}[b]{0.475\textwidth} \includegraphics[width=\textwidth]{sl_withoutZ_4iter_randomforest} - \caption{Results without unobservables with \\$N_{iter}=4$.} + \caption{Results without unobservables.} \label{fig:results_without_Z_rf} \end{subfigure} \quad %add desired spacing between images, e. g. ~, \quad, \qquad, \hfill etc. %(or a blank line to force the subfigure onto a new line) \begin{subfigure}[b]{0.475\textwidth} \includegraphics[width=\textwidth]{sl_withZ_6iter_betaZ_1_0_randomforest} - \caption{Results with unobservables, $\beta_Z=1$ and \\$N_{iter}=6$.} + \caption{Results with unobservables, $\beta_Z=1$.} \label{fig:results_with_Z_rf} \end{subfigure} \caption{Failure rate vs. acceptance rate with varying levels of leniency. Random forest classifier was trained on labeled training data} @@ -623,14 +625,14 @@ Given our framework defined in section \ref{sec:framework}, the results presente \centering \begin{subfigure}[b]{0.475\textwidth} \includegraphics[width=\textwidth]{sl_without_Z_15iter_random_model} - \caption{Failure rate vs. acceptance rate. Data without unobservables and $N_{iter}=15$. Machine predictions with random model.} + \caption{Failure rate vs. acceptance rate. Data without unobservables. Machine predictions with random model.} \label{fig:random_predictions_without_Z} \end{subfigure} \quad %add desired spacing between images, e. g. ~, \quad, \qquad, \hfill etc. 
%(or a blank line to force the subfigure onto a new line) \begin{subfigure}[b]{0.475\textwidth} \includegraphics[width=\textwidth]{sl_with_Z_15iter_fully_random_model} - \caption{Failure rate vs. acceptance rate. Data with unobservables and $N_{iter}=15$. Machine predictions with random model.} + \caption{Failure rate vs. acceptance rate. Data with unobservables. Machine predictions with random model.} \label{fig:random_predictions_with_Z} \end{subfigure} \caption{Failure rate vs. acceptance rate with varying levels of leniency. Machine predictions were done with completely random model, that is prediction $P(Y=0|X=x)=0.5$ for all $x$.} @@ -661,26 +663,26 @@ Monte Carlo & 0.001292 & 0.016629 & 0.009429 & 0.0179825\\ \centering \begin{subfigure}[b]{0.475\textwidth} \includegraphics[width=\textwidth]{sl_with_Z_10iter_coinflip_quantile_defaults_mc} - \caption{Outcome Y from Bernoulli, independent decisions using the quantiles and $N_{iter}=10$.} + \caption{Outcome Y from Bernoulli, independent decisions using the quantiles.} %\label{fig:modules_mc_without_Z} \end{subfigure} \quad %add desired spacing between images, e. g. ~, \quad, \qquad, \hfill etc. %(or a blank line to force the subfigure onto a new line) \begin{subfigure}[b]{0.475\textwidth} \includegraphics[width=\textwidth]{sl_with_Z_20iter_threshold_quantile_defaults_mc} - \caption{Outcome Y from threshold rule, independent decisions using the quantiles and $N_{iter}=20$.} + \caption{Outcome Y from threshold rule, independent decisions using the quantiles.} %\label{fig:modules_mc_with_Z} \end{subfigure} \begin{subfigure}[b]{0.475\textwidth} \includegraphics[width=\textwidth]{sl_with_Z_10iter_coinflip_lakkarajudecider_defaults_mc} - \caption{Outcome Y from Bernoulli, non-independent decisions and $N_{iter}=10$.} + \caption{Outcome Y from Bernoulli, non-independent decisions.} %\label{fig:modules_mc_without_Z} \end{subfigure} \quad %add desired spacing between images, e. g. ~, \quad, \qquad, \hfill etc. 
%(or a blank line to force the subfigure onto a new line) \begin{subfigure}[b]{0.475\textwidth} \includegraphics[width=\textwidth]{sl_with_Z_10iter_threshold_lakkarajudecider_defaults_mc} - \caption{Outcome Y from threshold rule, non-independent decisions and $N_{iter}=10$.} + \caption{Outcome Y from threshold rule, non-independent decisions.} %\label{fig:modules_mc_with_Z} \end{subfigure} \caption{Failure rate vs. acceptance rate with varying levels of leniency. Different combinations of deciders and data generation modules. See other modules used in section \ref{sec:modules_mc}} @@ -804,9 +806,13 @@ For example, a decision-maker with leniency 0.60 gets a new subject $\{x, z\}$ w \subsection{Evaluator modules} -Evaluator modules take some version of data as input and output an estimate of the failure given the input. +Evaluator modules take some version of data as input and output an estimate of the failure rate given the input. More discussion on the evaluator module is in section \ref{sec:modular_framework}. -Motivation for the contraction algorithm is presented in Lakkaraju's paper \cite{lakkaraju17}. The algorithm below is a slight modification of it to incorporate model $\B$. The original algorithm has been copied to algorithm \ref{alg:contraction}. +Motivation for the contraction algorithm is presented in Lakkaraju's paper \cite{lakkaraju17}. The original algorithm has been copied to algorithm \ref{alg:contraction}. The algorithm below has slight modifications to incorporate model $\B$ which tries to predict Y from x. About $\B$ from Lakkaraju: + +\begin{quote} +Black box predictive model. Another input to our framework is a black box predictive model $\B$ which assigns risk scores to observations in $\D$. More specifically, $\B$ is a function which maps the characteristics (or feature values) $\mathbf{x}$ of an observation in $\D$ to a probability score $s \in [0, 1]$. 
This score indicates how confident the model is in assigning the observation to $t = 0$ (e.g., denying bail). +\end{quote} \begin{algorithm}[H] % enter the algorithm environment \caption{Evaluator module: Contraction algorithm \cite{lakkaraju17}} % give the algorithm a caption @@ -816,24 +822,20 @@ Motivation for the contraction algorithm is presented in Lakkaraju's paper \cite \ENSURE \STATE Split data to test set and training set. \STATE Train a predictive model $\B$ on training data. -\STATE Estimate probability scores $\s$ using $\B$ for all observations in test data and attach to test data. +\STATE Estimate and assign probability scores $\s$ using $\B$ for all observations in test data. \STATE Let $q$ be the decision-maker with highest acceptance rate in $\D$. \STATE $\D_q = \{(x, j, t, y) \in \D|j=q\}$ -\STATE \hskip3.0em $\rhd$ $\D_q$ is the set of all observations judged by $q$ \STATE $\RR_q = \{(x, j, t, y) \in \D_q|t=1\}$ -\STATE \hskip3.0em $\rhd$ $\RR_q$ is the set of observations in $\D_q$ with observed outcome labels \STATE Sort observations in $\RR_q$ in descending order of confidence scores $\s$ and assign to $\RR_q^{sort}$. -\STATE \hskip3.0em $\rhd$ Observations deemed as high risk by the black-box model $\mathcal{B}$ are at the top of this list \STATE Remove the top $[(1.0-r)|\D_q |]-[|\D_q |-|\RR_q |]$ observations of $\RR_q^{sort}$ and call this list $\mathcal{R_B}$ -\STATE \hskip3.0em $\rhd$ $\mathcal{R_B}$ is the list of observations assigned to $t = 1$ by $\mathcal{B}$ \STATE Compute $\mathbf{u}=\sum_{i=1}^{|\mathcal{R_B}|} \dfrac{\delta\{y_i=0\}}{| \D_q |}$. \RETURN $\mathbf{u}$ \end{algorithmic} \end{algorithm} -%te eroaa muista algoritmeista / eriyispiirteenä että se saa käyttöönsä myös piilotetut outcomet. +True evaluation module computes the "true failure rate" of a predictive model \emph{had it been deployed to make independent decisions}. 
For computing the true failure rate "had the model been deployed" we need all outcome labels which is why the true failure rate can only be computed on synthetic data. -True evaluation module computes the "true failure rate" of a predictive model had it been deployed to make independent decisions. The module first assigns each observation with a predicted +In practice, the module first trains a model $\B$ and assigns each observation with a probability score $\s$ using it as described above. Then the observations are sorted in ascending order by the scores so that most risky subjects are last (subjects with the highest predicted probability for a negative outcome). Now when taking the first $r \cdot 100\%$ of observations the true failure rate can be computed straight from the ground truth. \begin{algorithm}[H] % enter the algorithm environment \caption{Evaluator module: True evaluation} % give the algorithm a caption @@ -843,15 +845,18 @@ True evaluation module computes the "true failure rate" of a predictive model ha \ENSURE \STATE Split data to test set and training set. \STATE Train a predictive model $\B$ on training data. -\STATE Estimate probability scores $\s$ using $\B$ for all observations in test data and attach to test data. +\STATE Estimate and assign probability scores $\s$ using $\B$ for all observations in test data. \STATE Sort the data by the probabilities $\s$ to ascending order. -\STATE \hskip3.0em $\rhd$ Now the most dangerous subjects are last. \STATE Calculate the number to release $N_{free} = |\D| \cdot r$. \RETURN $\frac{1}{|\D|}\sum_{i=1}^{N_{free}}\delta\{y_i=0\}$ \end{algorithmic} \end{algorithm} -%lo kuten te, mutta määrätyt Y:t on piilotettu +The labeled outcomes evaluator can be described to be the "vanilla estimate of the failure rate". Using probability scores $\s$ from $\B$, the estimate of failure rate is computed straight from the available labels. 
From Lakkaraju: + +\begin{quote} +Labeled Outcomes Only: To plot this curve, we first obtain all the subjects whose outcome labels are available in the evaluation set and rank them in ascending order based on the probability scores assigned by the predictive model. We then simulate the model at various values of acceptance rates r by assigning the observations corresponding to the top r fraction of the sorted list to yes decisions (t = 1). We then compute the failure rate on the observations assigned to yes decisions directly from their corresponding ground truth labels. +\end{quote} \begin{algorithm}[H] % enter the algorithm environment \caption{Evaluator module: Labeled outcomes} % give the algorithm a caption @@ -861,7 +866,7 @@ True evaluation module computes the "true failure rate" of a predictive model ha \ENSURE \STATE Split data to test set and training set. \STATE Train a predictive model $\B$ on training data. -\STATE Estimate probability scores $\s$ using $\B$ for all observations in test data and attach to test data. +\STATE Estimate and assign probability scores $\s$ using $\B$ for all observations in test data. \STATE Assign observations in test data with observed outcomes (T=1) to $\D_{observed}$. \STATE Sort $\D_{observed}$ by the probabilities $\s$ to ascending order. \STATE \hskip3.0em $\rhd$ Now the most dangerous subjects are last. @@ -870,42 +875,90 @@ True evaluation module computes the "true failure rate" of a predictive model ha \end{algorithmic} \end{algorithm} +The performance of human decision-makers (this is the decider in the modular framework) is evaluated with algorithm \ref{alg:eval:human_eval}. Following quite comprehensive description from Lakkaraju: + +\begin{quote} +This [failure rate estimation] can be done by grouping decision-makers with similar values of acceptance rate into bins and treating each bin as a single hypothetical decision-maker. 
We can then compute the failure rate and acceptance rate values for each such bin and plot them as a curve. We refer to this curve as the human evaluation curve. +\end{quote} + \begin{algorithm}[H] % enter the algorithm environment \caption{Evaluator module: Human evaluation} % give the algorithm a caption \label{alg:eval:human_eval} % and a label for \ref{} commands later in the document \begin{algorithmic}[1] % enter the algorithmic environment -\REQUIRE Data $\D$ with properties $\{x_i, j_i, t_i, y_i\}$, acceptance rate r +\REQUIRE Data $\D$ with properties $\{j_i, t_i, y_i\}$, acceptance rate r \ENSURE -\STATE \emph{Split data to test set and training set and discard the training set.} \STATE Assign judges with acceptance rate in $[r-0.05, r+0.05]$ to $\mathcal{J}$ -\STATE $\D_{released} = \{(x, j, t, y) \in \D~|~t=1 \wedge j \in \mathcal{J}\}$ +\STATE $\D_{released} = \{(j, t, y) \in \D~|~t=1 \wedge j \in \mathcal{J}\}$ \STATE \hskip3.0em $\rhd$ Subjects judged \emph{and} released by judges with correct leniency. \RETURN $\frac{1}{|\mathcal{J}|}\sum_{i=1}^{\D_{released}}\delta\{y_i=0\}$ \end{algorithmic} \end{algorithm} - -%kausaali evaluaattori kuten MM ensimmäisenä sen esitti +The initial approach to the selective labels problem is in algorithm \ref{alg:eval:causal_eval} below. In the algorithm, in addition to computing the probability scores $\s$ we also try to evaluate the quantity $F(x_0)$. We can describe $F$ as answering the question ``what fraction of observations have a lower risk than $x_0$?''. Now if it is deemed that the fraction given by $F$ is lower than $r$, the subject is released and these decisions are stored in $T_{causal}$. Now the estimated failure rate is the sum of the predicted probability scores of those released (according to $F$) divided by the number of observations. 
\begin{algorithm}[H] % enter the algorithm environment -\caption{Evaluator module: Causal evaluator (?)} % give the algorithm a caption +\caption{Evaluator module: Causal evaluator} % give the algorithm a caption \label{alg:eval:causal_eval} % and a label for \ref{} commands later in the document \begin{algorithmic}[1] % enter the algorithmic environment \REQUIRE Data $\D$ with properties $\{x_i, t_i, y_i\}$, acceptance rate r \ENSURE \STATE Split data to test set and training set. \STATE Train a predictive model $\B$ on training data. -\STATE Estimate probability scores $\s$ using $\B$ for all observations in test data and attach to test data. -\FORALL{$i$ in $1, \ldots, N_{total}$} - \STATE Evaluate $F(x_i) = \int_{x\in\mathcal{X}} P_X(x)\delta(f(x)<f(x_i)) ~dx$ and assign to $\mathcal{F}_{predictions}$ +\STATE Estimate and assign probability scores $\s$ using $\B$ for all observations in test data. +\FORALL{observations in $\D_{test}$} + \STATE Evaluate $F(x_i) = \int_{x\in\mathcal{X}} P_x(x)\delta(\B(x)<\B(x_i)) ~dx$ and assign to $\mathcal{F}_{predictions}$ \ENDFOR \STATE Create boolean array $T_{causal} = \mathcal{F}_{predictions} < r$. -\RETURN $\frac{1}{|\D_{test}|}\sum_{i=1}^{|\D_{test}|} \s_i \cdot T_{i, causal}$ which is equal to $\frac{1}{|\D|}\sum_{x\in\D} f(x)\delta(F(x) < r)$ +\RETURN $\frac{1}{|\D_{test}|}\sum_{i=1}^{|\D_{test}|} \s[i] \cdot T_{causal}[i]$ which is equal to $\frac{1}{|\D|}\sum_{x\in\D} f(x)\delta(F(x) < r)$ \end{algorithmic} \end{algorithm} %alla oleva montecarlo perusajatus ennustaa Z ja sen perusteella imputoida Y. selitä kaikki ja yksinkertaista +The latest approach to the problem is presented below in algorithm \ref{alg:eval:mc}. The high-level idea is to use the counterfactual outcomes Y(1) where the outcomes are missing and then compute the failure rate. To infer the probability $\pr(Y(1)=0)$, we need to make some inference about the latent variable Z. We can always infer some information about Z. 
If features X and the decision T are contradictory, we can infer more about Z than when X and T are aligned. + +The algorithm is based on the following equation which expresses the posterior probability of Z after observing T, X and R: +\begin{equation} \label{eq:posterior_Z} +\pr(Z|T, X, R) \propto \pr(T|X, Z, R) \cdot \pr(Z). +\end{equation} +Here $\pr(Z)$ is a prior on $Z$, in this case a standard Gaussian. The second term can formally be written as +\begin{equation} \label{eq:Tprob} + \pr(T=t|x, z, r)=\begin{cases} + 1-t, & \text{if $\pr(Y=0|x, z, DG) \geq F^{-1}(r)$}\\ + t, & \text{otherwise} + \end{cases} +\end{equation} +in probabilities or just deterministically +\begin{equation} \label{eq:Tdet} + T=\begin{cases} + 0, & \text{if $\pr(Y=0|x, z, DG) \geq F^{-1}(r)$}\\ + 1, & \text{otherwise}. + \end{cases} +\end{equation} +In equations \ref{eq:Tprob} and \ref{eq:Tdet}, $\pr(Y=0|x, z, DG)$ is the predicted probability of a negative outcome given x and z. The probability $\pr(Y=0|x, z, DG)$ is predicted by the judge and here we used an approximation that +\begin{equation} +\pr(Y=0|x, z, DG) = \sigma(\beta_Xx+\beta_Zz) +\end{equation} +which is an increasing function of $z$ when $x$ is given. Now we do not know the $\beta$ coefficients so here we used the information that they are one. (In the future, they should be inferred.) + +The inverse cumulative function $F^{-1}(r)$ in equations \ref{eq:Tprob} and \ref{eq:Tdet} is the inverse cumulative distribution of \emph{logit-normal distribution} with parameters $\mu=0$ and $\sigma^2=2$, i.e. $F^{-1}$ is the inverse cumulative distribution function of the sum of two standard Gaussians after logistic transformation. If $\beta_X \neq 1$ and/or $\beta_Z \neq 1$ then from the basic properties of variance $\sigma^2=Var(\beta_XX+\beta_ZZ)=\beta_X^2Var(X)+\beta_Z^2Var(Z)$. 
Finally the inverse cumulative function +\begin{equation} +F^{-1}(r) = logit^{-1}\left(\text{erf}^{-1}(2r-1)\sqrt{2\sigma^2}-\mu\right) +\end{equation} +where the parameters are as discussed and erf is the error function. + +With this knowledge, it can be stated that if we observed $T=0$ with some $x$ and $r$ it must have been that $\sigma(\beta_Xx+\beta_Zz) \geq F^{-1}(r)$. Using basic algebra we obtain that +\begin{equation} \label{eq:bounds} +logit^{-1}(x + z) \geq F^{-1}(r) \Leftrightarrow x+z \geq logit(F^{-1}(r)) \Leftrightarrow z \geq logit(F^{-1}(r)) - x +\end{equation} +because the logit and its inverse are strictly increasing functions and hence preserve the ordering for all pairs of values in their domains. From equations \ref{eq:posterior_Z}, \ref{eq:Tprob} and \ref{eq:bounds} we can conclude that $\pr(Z < logit(F^{-1}(r)) - x | T=0, X=x, R=r) = 0$ and that elsewhere the distribution of Z follows a truncated Gaussian with a lower bound of $logit(F^{-1}(r)) - x$. The expectation of Z can be computed analytically. All this follows analogously for cases with $T=1$ with some inequalities reversed. + +In practice, in lines 1--3 and 10--13 of algorithm \ref{alg:eval:mc} we do as in the True evaluation evaluator algorithm with the distinction that some of the values of Y are imputed with the corresponding counterfactual probabilities. In line 4 we compute the bounds as motivated above. In the for-loop (lines 5--8) we merely compute the expectation of Z given the knowledge of the decision and that the distribution of Z follows a truncated Gaussian. Using the expectation, we then compute the probability for the counterfactual $\pr(Y(1) = 0)$ (probability of a negative outcome had a positive decision been given). The equation +\begin{equation} +\hat{z} = (1-t) \cdot E(Z | Z > Q_r) + t \cdot E(Z | Z < Q_r) +\end{equation} +computes the correct expectation then automatically. 
In line 9 the imputation can be performed in a couple of ways: either by taking a random guess with probability $\pr(Y(1) = 0)$ or by assigning the most likely value for Y. + \begin{algorithm}[H] % enter the algorithm environment \caption{Evaluator module: Monte Carlo evaluator, imputation} % give the algorithm a caption \label{alg:eval:mc} % and a label for \ref{} commands later in the document @@ -914,21 +967,13 @@ True evaluation module computes the "true failure rate" of a predictive model ha \ENSURE \STATE Split data to test set and training set. \STATE Train a predictive model $\B$ on training data. -\STATE Estimate probability scores $\s$ using $\B$ for all observations in test data and attach to test data. -\STATE Sample $N_{sim}$ observations from a standard Gaussian and assign to Z. -\STATE Sample $N_{sim}$ observations from sum of two standard Gaussians (N(0, 2)) and assign to \texttt{quants}. -\STATE Transform the values of the samples in \texttt{quants} using the inverse of logit function. -\STATE Compute the values of the inverse cdf of the observations in \texttt{quants} for the acceptance rates r of each judge and assign to $Q_r$. -\FORALL{$i$ in $1, \ldots, N_{test}$} - \IF{$t_i = 0$} - \STATE{Take all $Z + \epsilon > logit(Q_{r,i})-x_i$ , where $\epsilon \sim N(0, 0.1)$.} - \ELSE - \STATE{Take all $Z + \epsilon < logit(Q_{r,i})-x_i$ , where $\epsilon \sim N(0, 0.1)$.} - \ENDIF - \STATE Compute $\bar{z}=\frac{1}{n}\sum z$ - \STATE Draw predictions $\hat{p}_{i,y}$ from Bernoulli($1-logit^{-1}(x_i+\bar{z})$) and assign to data. +\STATE Estimate and assign probability scores $\s$ using $\B$ for all observations in test data. +\STATE Compute bounds $Q_r = logit(F^{-1}(r)) - x$ for all judges. +\FORALL{observations in test set} + \STATE Compute expectation $\hat{z} = (1-t) \cdot E(Z | Z > Q_r) + t \cdot E(Z | Z < Q_r)$. % + \STATE Compute $\pr(Y(1) = 0) = logit^{-1}(x + \hat{z})$. \ENDFOR -\STATE Impute missing observations using $\hat{p}_y$. 
+\STATE Impute missing observations using the estimates $\pr(Y(1) = 0)$. \STATE Sort the data by the probabilities $\s$ to ascending order. \STATE \hskip3.0em $\rhd$ Now the most dangerous subjects are last. \STATE Calculate the number to release $N_{free} = |\D_{test}| \cdot r$. @@ -936,6 +981,8 @@ True evaluation module computes the "true failure rate" of a predictive model ha \end{algorithmic} \end{algorithm} +\newpage + \subsection{Summary table} Summary table of different modules. @@ -948,10 +995,10 @@ Summary table of different modules. \multicolumn{3}{c}{Module type} \\[.5\normalbaselineskip] \textbf{Data generator} & \textbf{Decider} & \textbf{Evaluator} \\ \midrule - {\ul Without unobservables} & Independent decisions & {\ul Labeled outcomes} \\ - & 1. draw T from a Bernoulli & \tabitem Data $\D$ with properties $\{x_i, t_i, y_i\}$ \\ - {\ul With unobservables} & with $P(T=0|X, Z)$ & \tabitem acceptance rate r \\ - \tabitem $P(Y=0|X, Z, W)$ & 2. determine with $F^{-1}(r)$ & \tabitem knowledge that X affects Y \\[.5\normalbaselineskip] + {\ul Without unobservables} & Independent decisions & {\ul Labeled outcomes} \\ + & 1. draw T from a Bernoulli & \tabitem Data $\D$ with properties $\{x_i, t_i, y_i\}$ \\ + {\ul With unobservables} & with $P(T=0|X, Z)$ & \tabitem acceptance rate r \\ + \tabitem $P(Y=0|X, Z, W)$ & 2. determine with $F^{-1}(r)$ & \tabitem knowledge that X affects Y \\[.5\normalbaselineskip] {\ul With unobservables} & Non-independent decisions & {\ul True evaluation} \\ \tabitem assign Y by & 3. sort by $P(T=0|X, Z)$ & \tabitem Data $\D$ with properties $\{x_i, t_i, y_i\}$ \\ @@ -968,7 +1015,7 @@ Summary table of different modules. 
& & \tabitem acceptance rate r \\ & & \tabitem knowledge that X affects Y \\[.5\normalbaselineskip] - & & {\ul Causal model (?)} \\ + & & {\ul Causal model} \\ & & \tabitem Data $\D$ with properties $\{x_i, t_i, y_i\}$ \\ & & \tabitem acceptance rate r \\ & & \tabitem knowledge that X affects Y \\[.5\normalbaselineskip] @@ -992,4 +1039,4 @@ Summary table of different modules. \end{thebibliography} -\end{document} \ No newline at end of file +\end{document} \ No newline at end of file diff --git a/figures/sl_diagnostic_bernoulli_batch_with_Z.png b/figures/sl_diagnostic_bernoulli_batch_with_Z.png new file mode 100644 index 0000000000000000000000000000000000000000..f9a41353995b9cfacde595b379a888b51295dba2 Binary files /dev/null and b/figures/sl_diagnostic_bernoulli_batch_with_Z.png differ diff --git a/figures/sl_diagnostic_bernoulli_independent_with_Z.png b/figures/sl_diagnostic_bernoulli_independent_with_Z.png new file mode 100644 index 0000000000000000000000000000000000000000..775110b517c579daa8d4843b071a6306941e21f2 Binary files /dev/null and b/figures/sl_diagnostic_bernoulli_independent_with_Z.png differ diff --git a/figures/sl_diagnostic_bernoulli_independent_without_Z.png b/figures/sl_diagnostic_bernoulli_independent_without_Z.png new file mode 100644 index 0000000000000000000000000000000000000000..92ef56ed557b50bd0b3829abab6d221f67223077 Binary files /dev/null and b/figures/sl_diagnostic_bernoulli_independent_without_Z.png differ diff --git a/figures/sl_diagnostic_threshold_batch_with_Z.png b/figures/sl_diagnostic_threshold_batch_with_Z.png new file mode 100644 index 0000000000000000000000000000000000000000..b5fb9da0e37c3b7d0040266df5b9123cfffe826a Binary files /dev/null and b/figures/sl_diagnostic_threshold_batch_with_Z.png differ diff --git a/figures/sl_diagnostic_threshold_independent_with_Z.png b/figures/sl_diagnostic_threshold_independent_with_Z.png new file mode 100644 index 
0000000000000000000000000000000000000000..b66abfa91a44854560837ebd067283a80a17a804 Binary files /dev/null and b/figures/sl_diagnostic_threshold_independent_with_Z.png differ