From 3e84e4a6c226e41b2b1e89e3ffffe0a5d74b6ebc Mon Sep 17 00:00:00 2001
From: Riku-Laine <28960190+Riku-Laine@users.noreply.github.com>
Date: Tue, 6 Aug 2019 13:57:15 +0300
Subject: [PATCH] Outline writing, corrected some models. Defined standard
 error better.

---
 analysis_and_scripts/code_full.stan           |   73 ++
 ...ode_jung.stan => code_jung_binarized.stan} |   55 +-
 .../stan_modelling_empirical.py               |    2 +-
 ...modelling_empirical_SEfixed_usefeatures.py |  532 ++++++++
 ...delling_empirical_SEfixed_useprediction.py |  532 ++++++++
 .../stan_modelling_theoretic_SEfixed_useX.py  | 1101 +++++++++++++++++
 ...delling_theoretic_SEfixed_useprediction.py | 1101 +++++++++++++++++
 paper/sl.tex                                  |  167 ++-
 8 files changed, 3478 insertions(+), 85 deletions(-)
 create mode 100644 analysis_and_scripts/code_full.stan
 rename analysis_and_scripts/{code_jung.stan => code_jung_binarized.stan} (72%)
 create mode 100644 analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py
 create mode 100644 analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py
 create mode 100644 analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py
 create mode 100644 analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py

diff --git a/analysis_and_scripts/code_full.stan b/analysis_and_scripts/code_full.stan
new file mode 100644
index 0000000..5d484a3
--- /dev/null
+++ b/analysis_and_scripts/code_full.stan
@@ -0,0 +1,73 @@
+data {
+  int<lower=1> D;
+  int<lower=0> N_obs;
+  int<lower=0> N_cens;
+  int<lower=0> M;
+  real<lower=0> sigma_tau;
+  int<lower=1, upper=M> jj_obs[N_obs]; // judge_ID
+  int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID
+  int<lower=0, upper=1> dec_obs[N_obs];
+  int<lower=0, upper=1> dec_cens[N_cens];
+  row_vector[D] X_obs[N_obs];
+  row_vector[D] X_cens[N_cens];
+  int<lower=0, upper=1> y_obs[N_obs];
+}
+
+parameters {
+  vector[N_obs] Z_obs;
+  vector[N_cens] Z_cens;
+  real alpha_T[M];
+  vector[D] beta_XT_raw;
+  vector[D] beta_XY_raw;
+  real<lower=0> beta_ZT_raw; // Jung-style assumption: positive effect of the latent variable
+  real<lower=0> beta_ZY_raw; // Jung-style assumption: positive effect of the latent variable
+  
+  real<lower=0> tau_XT;
+  real<lower=0> tau_XY;
+  real<lower=0> tau_ZT;
+  real<lower=0> tau_ZY;
+
+}
+
+transformed parameters {
+  vector[D] beta_XT;
+  vector[D] beta_XY;
+  real<lower=0> beta_ZT; // Jung-style assumption: positive effect of the latent variable
+  real<lower=0> beta_ZY; // Jung-style assumption: positive effect of the latent variable
+
+  beta_XT = tau_XT * beta_XT_raw;
+  beta_XY = tau_XY * beta_XY_raw;
+  beta_ZT = tau_ZT * beta_ZT_raw;
+  beta_ZY = tau_ZY * beta_ZY_raw;
+}
+
+model {
+  Z_obs ~ normal(0, 1);
+  Z_cens ~ normal(0, 1);
+  
+  tau_XT ~ normal(0, sigma_tau);
+  tau_XY ~ normal(0, sigma_tau);
+  tau_ZT ~ normal(0, sigma_tau);
+  tau_ZY ~ normal(0, sigma_tau);
+  
+  beta_XT_raw ~ normal(0, 1);
+  beta_XY_raw ~ normal(0, 1);
+  beta_ZT_raw ~ normal(0, 1);
+  beta_ZY_raw ~ normal(0, 1);
+  
+  for(i in 1:N_obs){
+    dec_obs[i] ~ bernoulli_logit(alpha_T[jj_obs[i]] + X_obs[i] * beta_XT + beta_ZT * Z_obs[i]);
+    y_obs[i] ~ bernoulli_logit(X_obs[i] * beta_XY + beta_ZY * Z_obs[i]);
+  }
+  
+  for(i in 1:N_cens)
+    dec_cens[i] ~ bernoulli_logit(alpha_T[jj_cens[i]] + X_cens[i] * beta_XT + beta_ZT * Z_cens[i]);
+}
+
+generated quantities {
+  int<lower=0, upper=1> y_est[N_cens];
+  
+  for(i in 1:N_cens){
+    y_est[i] = bernoulli_logit_rng(X_cens[i] * beta_XY + beta_ZY * Z_cens[i]);
+  }
+}
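
For orientation, here is a minimal sketch (not part of the patch) of how code_full.stan can be driven from Python with PyStan, mirroring the StanModel/sampling/extract calls used in the scripts further down; the toy data below is made up and uses smaller chain and iteration settings than the scripts.

    import numpy as np
    import pystan

    # Toy data matching the data block of code_full.stan (made-up values).
    np.random.seed(0)
    N_obs, N_cens, D, M = 40, 10, 2, 3
    dat = dict(D=D, N_obs=N_obs, N_cens=N_cens, M=M, sigma_tau=1.0,
               jj_obs=np.random.randint(1, M + 1, N_obs),   # judge IDs, 1..M
               jj_cens=np.random.randint(1, M + 1, N_cens),
               dec_obs=np.ones(N_obs, dtype=int),           # observed subjects were released
               dec_cens=np.zeros(N_cens, dtype=int),        # censored subjects were jailed
               X_obs=np.random.normal(size=(N_obs, D)),
               X_cens=np.random.normal(size=(N_cens, D)),
               y_obs=np.random.binomial(1, 0.5, N_obs))

    sm = pystan.StanModel(file='analysis_and_scripts/code_full.stan')
    fit = sm.sampling(data=dat, chains=2, iter=1000, control=dict(adapt_delta=0.9))
    y_est = fit.extract()['y_est']   # posterior draws of the censored outcomes
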
diff --git a/analysis_and_scripts/code_jung.stan b/analysis_and_scripts/code_jung_binarized.stan
similarity index 72%
rename from analysis_and_scripts/code_jung.stan
rename to analysis_and_scripts/code_jung_binarized.stan
index 8ee2466..259293a 100644
--- a/analysis_and_scripts/code_jung.stan
+++ b/analysis_and_scripts/code_jung_binarized.stan
@@ -21,8 +21,8 @@ parameters {
   vector[N_cens] Z_cens;
   real alpha_T[M];  // Judge-specific intercepts
   
-  vector[D] beta_XT[K]; 
-  vector[D] beta_XY[K];
+  vector[D] beta_XT_raw[K]; 
+  vector[D] beta_XY_raw[K];
   
   vector<lower=0>[K] beta_ZT_raw; // Coefficient for the latent variable.
   vector<lower=0>[K] beta_ZY_raw; // Coefficient for the latent variable.
@@ -34,18 +34,36 @@ parameters {
 }
 
 transformed parameters{
+  vector[D] beta_XT[K]; 
+  vector[D] beta_XY[K];
+
   vector<lower=0>[K] beta_ZT_cumulative;
   vector<lower=0>[K] beta_ZY_cumulative;
   
   vector<lower=0>[K] beta_ZT;
   vector<lower=0>[K] beta_ZY;
   
-  beta_ZT[1] = beta_ZT_raw[1];
-  beta_ZY[1] = beta_ZY_raw[1];
-  
   if(K >= 2){
+    beta_XT[1] = beta_XT_raw[1];
+    beta_XY[1] = beta_XY_raw[1];
+    
+    for(i in 2:K){ // random walk prior here
+      beta_XT[i] = beta_XT[i-1] + tau_XT * beta_XT_raw[i]; // ith group
+      beta_XY[i] = beta_XY[i-1] + tau_XY * beta_XY_raw[i];
+    }
+    
+    beta_ZT[1] = beta_ZT_raw[1];
+    beta_ZY[1] = beta_ZY_raw[1];
+    
     beta_ZT[2:] = tau_ZT * beta_ZT_raw[2:];
     beta_ZY[2:] = tau_ZY * beta_ZY_raw[2:];
+    
+  } else {
+    // if only one group, constrain variances with the tau prior
+    beta_XT[1] = tau_XT * beta_XT_raw[1];
+    beta_XY[1] = tau_XY * beta_XY_raw[1];
+    beta_ZT = tau_ZT * beta_ZT_raw;
+    beta_ZY = tau_ZY * beta_ZY_raw;
   }
   
   beta_ZT_cumulative = cumulative_sum(beta_ZT);
@@ -53,25 +71,20 @@ transformed parameters{
 }
 
 model {
-  Z_obs  ~ std_normal();
-  Z_cens ~ std_normal();
+  Z_obs  ~ normal(0, 1);
+  Z_cens ~ normal(0, 1);
   
-  tau_XY ~ normal(0, sigma_tau);
   tau_XT ~ normal(0, sigma_tau);
-  tau_ZY ~ normal(0, sigma_tau);
+  tau_XY ~ normal(0, sigma_tau);
   tau_ZT ~ normal(0, sigma_tau);
+  tau_ZY ~ normal(0, sigma_tau);
   
-  beta_XT[1]  ~ std_normal(); // first group
-  beta_XY[1]  ~ std_normal();
-  beta_ZT_raw ~ std_normal();
-  beta_ZY_raw ~ std_normal();
-  
-  if(K >= 2){
-    for(i in 2:K){ // random walk prior here
-      beta_XT[i] ~ normal(beta_XT[i-1], tau_XT); // ith group
-      beta_XY[i] ~ normal(beta_XY[i-1], tau_XY);
-    }
+  for(i in 1:K){
+    beta_XT_raw[i] ~ normal(0, 1);
+    beta_XY_raw[i] ~ normal(0, 1);
   }
+  beta_ZT_raw ~ normal(0, 1);
+  beta_ZY_raw ~ normal(0, 1);
 
   for(i in 1:N_obs){
     dec_obs[i] ~ bernoulli_logit(alpha_T[jj_obs[i]] + X_obs[i] * beta_XT[kk_obs[i]] + beta_ZT_cumulative[kk_obs[i]] * Z_obs[i]);
@@ -83,9 +96,9 @@ model {
 }
 
 generated quantities {
-  real<lower=0, upper=1> y_est[N_cens];
+  int<lower=0, upper=1> y_est[N_cens];
   
   for(i in 1:N_cens){
-    y_est[i] = inv_logit(X_cens[i] * beta_XY[kk_cens[i]] + beta_ZY_cumulative[kk_cens[i]] * Z_cens[i]);
+    y_est[i] = bernoulli_logit_rng(X_cens[i] * beta_XY[kk_cens[i]] + beta_ZY_cumulative[kk_cens[i]] * Z_cens[i]);
   }
 }
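
As a hedged illustration (not part of the patch), the cumulative-sum construction in the transformed parameters block above can be mimicked in NumPy; the draws below are arbitrary stand-ins for tau_ZT and beta_ZT_raw and only demonstrate that the group coefficients beta_ZT_cumulative come out non-negative and non-decreasing.

    import numpy as np

    np.random.seed(1)
    K = 6                                     # number of groups (group_amount)
    tau_ZT = abs(np.random.normal(0.0, 1.0))          # stand-in for the half-normal scale
    beta_ZT_raw = np.abs(np.random.normal(size=K))    # positive raw coefficients
    beta_ZT = beta_ZT_raw.copy()
    beta_ZT[1:] *= tau_ZT                     # groups 2..K are scaled by tau_ZT, as in the model
    beta_ZT_cumulative = np.cumsum(beta_ZT)   # non-negative, non-decreasing across groups
    print(beta_ZT_cumulative)
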
diff --git a/analysis_and_scripts/stan_modelling_empirical.py b/analysis_and_scripts/stan_modelling_empirical.py
index e05c3a7..6ff23e3 100644
--- a/analysis_and_scripts/stan_modelling_empirical.py
+++ b/analysis_and_scripts/stan_modelling_empirical.py
@@ -459,7 +459,7 @@ dat = dict(D=1,
 
 sm = pystan.StanModel(file=stan_code_file_name)
 
-fit = sm.sampling(data=dat, chains=5, iter=4000, control = dict(adapt_delta=0.9))
+fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9))
 
 pars = fit.extract()
 
diff --git a/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py b/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py
new file mode 100644
index 0000000..18eb131
--- /dev/null
+++ b/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py
@@ -0,0 +1,532 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Author: Riku Laine
+# Date: 26JUL2019
+# Project name: Potential outcomes in model evaluation
+# Description: This script creates the figures and results used 
+#              in empirical data experiments.
+#
+# Parameters:
+# -----------
+# (1) figure_path : File name for saving the created figures.
+# (2) group_amount : Number of groups if the Jung-inspired model is used.
+# (3) stan_code_file_name : Name of the file containing the Stan model code.
+# (4) sigma_tau : Value of the prior variance for the Jung-inspired model.
+# (5) data_path : Path to the COMPAS data file.
+
+"""
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import scipy.special as ssp
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+import pystan
+
+plt.switch_backend('agg')
+
+import sys
+
+# figure storage name
+figure_path = sys.argv[1]
+
+# How many groups if jung model is used
+group_amount = int(sys.argv[2])
+
+# Name of stan model code file
+stan_code_file_name = sys.argv[3]
+
+# Variance prior
+sigma_tau = float(sys.argv[4])
+
+# Path to the data file
+data_path = sys.argv[5]
+
+# Prefix for the figures and log files
+
+print("These results have been obtained with the following settings:")
+
+print("Number of groups:", group_amount)
+
+print("Prior for the variances:", sigma_tau)
+
+save_name = "sl_compas"
+
+def inv_logit(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
+
+
+def logit(x):
+    return np.log(x) - np.log(1.0 - x)
+
+
+def inverse_cumulative(x, mu, sigma):
+    '''Compute the inverse of the cumulative distribution of logit-normal
+    distribution at x with parameters mu and sigma (mean and st.dev.).'''
+
+    return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
+
+def standardError(p, n):
+    denominator = p * (1 - p)
+    
+    return np.sqrt(denominator / n)
+
+##########################
+
+# ## Evaluator modules
+
+# ### Convenience functions
+
+def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
+    '''
+    Fit a predictive model (default logistic regression) with given training 
+    instances and return probabilities for test instances to obtain a given 
+    class label.
+    
+    Arguments:
+    ----------
+    
+    x_train -- x values of training instances
+    y_train -- y values of training instances
+    x_test -- x values of test instances
+    class_value -- class label for which the probabilities are computed.
+    model_type -- type of model to be fitted.
+    
+    Returns:
+    --------
+    (1) Trained predictive model
+    (2) Probabilities for given test inputs for given class.
+    '''
+
+    if model_type is None or model_type in ["logistic_regression", "lr"]:
+        # Instantiate the model (using the default parameters)
+        logreg = LogisticRegression(solver='lbfgs')
+
+        # Check shape and fit the model.
+        if x_train.ndim == 1:
+            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
+        else:
+            logreg = logreg.fit(x_train, y_train)
+
+        label_probs_logreg = getProbabilityForClass(x_test, logreg,
+                                                    class_value)
+
+        return logreg, label_probs_logreg
+
+    elif model_type in ["random_forest", "rf"]:
+        # Instantiate the model
+        forest = RandomForestClassifier(n_estimators=100, max_depth=3)
+
+        # Check shape and fit the model.
+        if x_train.ndim == 1:
+            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
+        else:
+            forest = forest.fit(x_train, y_train)
+
+        label_probs_forest = getProbabilityForClass(x_test, forest,
+                                                    class_value)
+
+        return forest, label_probs_forest
+
+    elif model_type == "fully_random":
+
+        label_probs = np.ones_like(x_test) / 2
+
+        model_object = lambda x: 0.5
+
+        return model_object, label_probs
+    else:
+        raise ValueError("Invalid model_type!", model_type)
+
+
+def getProbabilityForClass(x, model, class_value):
+    '''
+    Function (wrapper) for obtaining the probability of a class given x and a 
+    predictive model.
+
+    Arguments:
+    -----------
+    x -- individual features, an array of shape (observations, features)
+    model -- a trained sklearn model. Predicts probabilities for given x. 
+        Should accept input of shape (observations, features)
+    class_value -- the resulting class to predict (usually 0 or 1).
+
+    Returns:
+    --------
+    (1) The probabilities of given class label for each x.
+    '''
+    if x.ndim == 1:
+        # if x is vector, transform to column matrix.
+        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
+    else:
+        f_values = model.predict_proba(x)
+
+    # Get correct column of predicted class, remove extra dimensions and return.
+    return f_values[:, model.classes_ == class_value].flatten()
+
+
+# ### Contraction algorithm
+# 
+# Below is an implementation of the algorithm presented by Lakkaraju et al. in
+# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). The relevant
+# parameters to be passed to the function are described below.
+
+def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
+                accRateR_col, r):
+    '''
+    This is an implementation of the algorithm presented by Lakkaraju
+    et al. in their paper "The Selective Labels Problem: Evaluating 
+    Algorithmic Predictions in the Presence of Unobservables" (2017).
+
+    Arguments:
+    ----------
+    df -- The (Pandas) data frame containing the data, judge decisions,
+        judge IDs, results and probability scores.
+    judgeIDJ_col -- String, the name of the column containing the judges' IDs
+        in df.
+    decisionT_col -- String, the name of the column containing the judges' decisions
+    resultY_col -- String, the name of the column containing the realization
+    modelProbS_col -- String, the name of the column containing the probability
+        scores from the black-box model B.
+    accRateR_col -- String, the name of the column containing the judges' 
+        acceptance rates
+    r -- Float between 0 and 1, the given acceptance rate.
+
+    Returns:
+    --------
+    (1) The estimated failure rate at acceptance rate r.
+    '''
+    # Get ID of the most lenient judge.
+    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
+
+    # Subset. "D_q is the set of all observations judged by q."
+    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
+
+    # All observations of R_q have observed outcome labels.
+    # "R_q is the set of observations in D_q with observed outcome labels."
+    R_q = D_q[D_q[decisionT_col] == 1].copy()
+
+    # Sort observations in R_q in descending order of confidence scores S and
+    # assign to R_sort_q.
+    # "Observations deemed as high risk by B are at the top of this list"
+    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
+
+    number_to_remove = int(
+        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
+
+    # "R_B is the list of observations assigned to t = 1 by B"
+    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
+
+    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
+
+
+# ### Evaluators
+
+def trueEvaluationEvaluator(df, resultY_col, r):
+
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+    to_release = int(round(df.shape[0] * r))
+    
+    failed = df[resultY_col][0:to_release] == 0
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
+
+    df_observed = df.loc[df[decisionT_col] == 1, :]
+
+    df_observed = df_observed.sort_values(by='B_prob_0_model',
+                                              inplace=False,
+                                              ascending=True)
+
+    to_release = int(round(df_observed.shape[0] * r))
+    
+    failed = df_observed[resultY_col][0:to_release] == 0
+
+    if adjusted:
+        return np.mean(failed), df.shape[0]
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
+                             accRateR_col, r):
+
+    # Get judges with correct leniency as list
+    is_correct_leniency = df[accRateR_col].round(1) == r
+
+    # No judges with correct leniency
+    if np.sum(is_correct_leniency) == 0:
+        return np.nan, np.nan
+
+    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
+
+    # Released are the people they judged and released, T = 1
+    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
+                  & (df[decisionT_col] == 1)]
+
+    failed = released[resultY_col] == 0
+    
+    # Get their failure rate, aka ratio of reoffenders to number of people judged in total
+    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
+
+###################
+
+# Read in the data
+compas_raw = pd.read_csv(data_path)
+
+# Select columns
+compas = compas_raw[[
+    'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
+    'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
+    'two_year_recid', 'c_jail_in', 'c_jail_out'
+]]
+
+# Subset values, see reasons in ProPublica methodology.
+compas = compas.query('days_b_screening_arrest <= 30 and \
+                      days_b_screening_arrest >= -30 and \
+                      is_recid != -1 and \
+                      c_charge_degree != "O"')
+
+# Drop row if score_text is na
+compas = compas[compas.score_text.notnull()]
+
+# Recode recidivism values to fit earlier notation
+# So here result_Y = 1 - is_recid (inverted binary coding).
+compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1)
+
+# Convert string values to dummies, drop first so full rank
+compas_dummy = pd.get_dummies(
+    compas,
+    columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
+    drop_first=True)
+
+compas_dummy.drop(columns=[
+    'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
+    'two_year_recid', 'score_text', 'is_recid'
+],
+    inplace=True)
+
+# Shuffle rows for random judge assignment
+compas_shuffled = compas_dummy.sample(frac=1)
+
+nJudges_M = 9
+
+# Assign judges as evenly as possible
+judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False)
+
+# Assign fixed leniencies from 0.1 to 0.9
+judge_leniency = np.arange(1, 10) / 10
+
+judge_leniency = judge_leniency[judge_ID]
+
+compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID,
+                                         judge_leniency=judge_leniency)
+
+# Sort by judges then probabilities in decreasing order
+# Most dangerous for each judge are at the top.
+compas_shuffled.sort_values(by=["judge_ID", "decile_score"],
+                            ascending=False,
+                            inplace=True)
+
+# Iterate over the data. Subject will be given a negative decision
+# if they are in the top (1-r)*100% of the individuals the judge will judge.
+# I.e. if their within-judge-index is under 1 - acceptance threshold times
+# the number of subjects assigned to each judge they will receive a
+# negative decision.
+compas_shuffled.reset_index(drop=True, inplace=True)
+
+subjects_allocated = compas_shuffled.judge_ID.value_counts()
+
+compas_shuffled['judge_index'] = compas_shuffled.groupby('judge_ID').cumcount()
+
+compas_shuffled['decision_T'] = np.where(
+    compas_shuffled['judge_index'] < (1 - compas_shuffled['judge_leniency']) *
+    subjects_allocated[compas_shuffled['judge_ID']].values, 0, 1)
+
+compas_labeled = compas_shuffled.copy()
+compas_unlabeled = compas_shuffled.copy()
+
+# Hide unobserved
+compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan
+
+# Choose feature_columns
+feature_cols = ~compas_labeled.columns.isin(
+    ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T'])
+
+feature_cols = compas_labeled.columns[feature_cols]
+
+
+####### Draw figures ###########
+
+failure_rates = np.zeros((8, 6))
+error_Ns = np.zeros((8, 6))
+
+# Split data
+train, test_labeled = train_test_split(compas_labeled, test_size=0.5)
+
+# Assign same observations to unlabeled data
+test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values]
+
+
+# Train a logistic regression model
+B_model, predictions = fitPredictiveModel(
+    train.loc[train['decision_T'] == 1, feature_cols],
+    train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0)
+
+test_labeled = test_labeled.assign(B_prob_0_model=predictions)
+test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
+
+test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
+
+# Find observed values
+observed = test_labeled['decision_T'] == 1
+
+# Assign data to the model
+dat = dict(D=10,
+           N_obs=np.sum(observed),
+           N_cens=np.sum(~observed),
+           K=group_amount,
+           sigma_tau=sigma_tau,
+           M=len(set(compas_labeled['judge_ID'])),
+           jj_obs=test_labeled.loc[observed, 'judge_ID']+1,
+           jj_cens=test_labeled.loc[~observed, 'judge_ID']+1,
+           kk_obs=kk_array[observed]+1,
+           kk_cens=kk_array[~observed]+1,
+           dec_obs=test_labeled.loc[observed, 'decision_T'],
+           dec_cens=test_labeled.loc[~observed, 'decision_T'],
+           X_obs=test_labeled.loc[observed, feature_cols.values],
+           X_cens=test_labeled.loc[~observed, feature_cols.values],
+           y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
+
+sm = pystan.StanModel(file=stan_code_file_name)
+
+fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9))
+
+pars = fit.extract()
+
+print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
+
+plt.figure(figsize=(15,30))
+
+fit.plot();
+
+plt.savefig(save_name + '_stan_diagnostic_plot')
+
+plt.show()
+plt.close('all')
+
+# Bayes
+
+# Initialize a matrix with one posterior sample per row
+# and one observation per column
+y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
+
+# Fill unobserved entries with the model estimates
+y_imp[:, ~observed] = 1-pars['y_est']
+
+# Fill observed entries with the observed outcomes
+y_imp[:, observed] = 1-test_labeled.loc[observed, 'result_Y']
+
+Rs = np.arange(.1, .9, .1)
+
+to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
+
+f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan)
+
+for i in range(len(to_release_list)):
+    
+    failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
+    
+    est_failure_rates =  failed / test_labeled.shape[0]
+            
+    failure_rates[i, 5] = np.mean(est_failure_rates)
+    
+    error_Ns[i, 5] = test_labeled.shape[0]
+
+for r in np.arange(1, 9):
+
+    print(".", end="")
+
+    # True evaluation
+
+    FR, N = trueEvaluationEvaluator(test_unlabeled, 'result_Y', r / 10)
+    
+    failure_rates[r - 1, 0] = FR
+    error_Ns[r - 1, 0] = N
+
+    # Labeled outcomes only
+
+    FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10)
+
+    failure_rates[r - 1, 1] = FR
+    error_Ns[r - 1, 1] = N
+    
+    # Adjusted labeled outcomes
+
+    FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10,
+                                     adjusted=True)
+
+    failure_rates[r - 1, 2] = FR
+    error_Ns[r - 1, 2] = N
+
+    # Human evaluation
+
+    FR, N = humanEvaluationEvaluator(test_labeled, 'judge_ID', 'decision_T', 
+                                     'result_Y',  'judge_leniency', 
+                                     r / 10)
+
+    failure_rates[r - 1, 3] = FR
+    error_Ns[r - 1, 3] = N
+
+    # Contraction
+
+    FR, N = contraction(test_labeled, 'judge_ID', 'decision_T', 'result_Y', 
+                        'B_prob_0_model', 'judge_leniency', r / 10)
+
+    failure_rates[r - 1, 4] = FR
+    error_Ns[r - 1, 4] = N
+
+x_ax = np.arange(0.1, 0.9, 0.1)
+
+failure_SEs = standardError(failure_rates, error_Ns)
+
+labels = [
+    'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
+    'Human evaluation', 'Contraction', 'Potential outcomes'
+]
+colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c']
+
+for i in range(failure_rates.shape[1]):
+    plt.errorbar(x_ax,
+                 failure_rates[:, i],
+                 label=labels[i],
+                 c=colours[i],
+                 yerr=failure_SEs[:, i])
+
+plt.title('Failure rate vs. Acceptance rate')
+plt.xlabel('Acceptance rate')
+plt.ylabel('Failure rate')
+plt.legend()
+plt.grid()
+
+plt.savefig(save_name + '_all')
+
+plt.show()
+
+print("\nFailure rates:")
+print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
+
+print("\nMean absolute errors:")
+for i in range(1, failure_rates.shape[1]):
+    print(
+        labels[i].ljust(len(max(labels, key=len))),
+        np.round(
+            np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
\ No newline at end of file
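
As a quick reference (not part of the patch), the standardError helper above is the usual binomial standard error sqrt(p * (1 - p) / n); the sketch below applies it to a made-up failure rate, the way the script feeds failure_rates and error_Ns to plt.errorbar.

    import numpy as np

    def standardError(p, n):
        # Binomial standard error of an estimated proportion p from n subjects.
        return np.sqrt(p * (1 - p) / n)

    p_hat, n = 0.2, 1200                          # made-up failure rate and evaluation count
    se = standardError(p_hat, n)                  # about 0.0115
    print(p_hat - 1.96 * se, p_hat + 1.96 * se)   # rough 95% interval for the failure rate
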
diff --git a/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py b/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py
new file mode 100644
index 0000000..49e23f2
--- /dev/null
+++ b/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py
@@ -0,0 +1,532 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Author: Riku Laine
+# Date: 26JUL2019
+# Project name: Potential outcomes in model evaluation
+# Description: This script creates the figures and results used 
+#              in empirical data experiments.
+#
+# Parameters:
+# -----------
+# (1) figure_path : File name for saving the created figures.
+# (2) group_amount : Number of groups if the Jung-inspired model is used.
+# (3) stan_code_file_name : Name of the file containing the Stan model code.
+# (4) sigma_tau : Value of the prior variance for the Jung-inspired model.
+# (5) data_path : Path to the COMPAS data file.
+
+"""
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import scipy.special as ssp
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+import pystan
+
+plt.switch_backend('agg')
+
+import sys
+
+# figure storage name
+figure_path = sys.argv[1]
+
+# How many groups if jung model is used
+group_amount = int(sys.argv[2])
+
+# Name of stan model code file
+stan_code_file_name = sys.argv[3]
+
+# Variance prior
+sigma_tau = float(sys.argv[4])
+
+# Path to the data file
+data_path = sys.argv[5]
+
+# Prefix for the figures and log files
+
+print("These results have been obtained with the following settings:")
+
+print("Number of groups:", group_amount)
+
+print("Prior for the variances:", sigma_tau)
+
+save_name = "sl_compas"
+
+def inv_logit(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
+
+
+def logit(x):
+    return np.log(x) - np.log(1.0 - x)
+
+
+def inverse_cumulative(x, mu, sigma):
+    '''Compute the inverse of the cumulative distribution of logit-normal
+    distribution at x with parameters mu and sigma (mean and st.dev.).'''
+
+    return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
+
+def standardError(p, n):
+    denominator = p * (1 - p)
+    
+    return np.sqrt(denominator / n)
+
+##########################
+
+# ## Evaluator modules
+
+# ### Convenience functions
+
+def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
+    '''
+    Fit a predictive model (default logistic regression) with given training 
+    instances and return probabilities for test instances to obtain a given 
+    class label.
+    
+    Arguments:
+    ----------
+    
+    x_train -- x values of training instances
+    y_train -- y values of training instances
+    x_test -- x values of test instances
+    class_value -- class label for which the probabilities are computed.
+    model_type -- type of model to be fitted.
+    
+    Returns:
+    --------
+    (1) Trained predictive model
+    (2) Probabilities for given test inputs for given class.
+    '''
+
+    if model_type is None or model_type in ["logistic_regression", "lr"]:
+        # Instantiate the model (using the default parameters)
+        logreg = LogisticRegression(solver='lbfgs')
+
+        # Check shape and fit the model.
+        if x_train.ndim == 1:
+            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
+        else:
+            logreg = logreg.fit(x_train, y_train)
+
+        label_probs_logreg = getProbabilityForClass(x_test, logreg,
+                                                    class_value)
+
+        return logreg, label_probs_logreg
+
+    elif model_type in ["random_forest", "rf"]:
+        # Instantiate the model
+        forest = RandomForestClassifier(n_estimators=100, max_depth=3)
+
+        # Check shape and fit the model.
+        if x_train.ndim == 1:
+            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
+        else:
+            forest = forest.fit(x_train, y_train)
+
+        label_probs_forest = getProbabilityForClass(x_test, forest,
+                                                    class_value)
+
+        return forest, label_probs_forest
+
+    elif model_type == "fully_random":
+
+        label_probs = np.ones_like(x_test) / 2
+
+        model_object = lambda x: 0.5
+
+        return model_object, label_probs
+    else:
+        raise ValueError("Invalid model_type!", model_type)
+
+
+def getProbabilityForClass(x, model, class_value):
+    '''
+    Function (wrapper) for obtaining the probability of a class given x and a 
+    predictive model.
+
+    Arguments:
+    -----------
+    x -- individual features, an array of shape (observations, features)
+    model -- a trained sklearn model. Predicts probabilities for given x. 
+        Should accept input of shape (observations, features)
+    class_value -- the resulting class to predict (usually 0 or 1).
+
+    Returns:
+    --------
+    (1) The probabilities of given class label for each x.
+    '''
+    if x.ndim == 1:
+        # if x is vector, transform to column matrix.
+        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
+    else:
+        f_values = model.predict_proba(x)
+
+    # Get correct column of predicted class, remove extra dimensions and return.
+    return f_values[:, model.classes_ == class_value].flatten()
+
+
+# ### Contraction algorithm
+# 
+# Below is an implementation of the algorithm presented by Lakkaraju et al. in
+# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). The relevant
+# parameters to be passed to the function are described below.
+
+def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
+                accRateR_col, r):
+    '''
+    This is an implementation of the algorithm presented by Lakkaraju
+    et al. in their paper "The Selective Labels Problem: Evaluating 
+    Algorithmic Predictions in the Presence of Unobservables" (2017).
+
+    Arguments:
+    ----------
+    df -- The (Pandas) data frame containing the data, judge decisions,
+        judge IDs, results and probability scores.
+    judgeIDJ_col -- String, the name of the column containing the judges' IDs
+        in df.
+    decisionT_col -- String, the name of the column containing the judges' decisions
+    resultY_col -- String, the name of the column containing the realization
+    modelProbS_col -- String, the name of the column containing the probability
+        scores from the black-box model B.
+    accRateR_col -- String, the name of the column containing the judges' 
+        acceptance rates
+    r -- Float between 0 and 1, the given acceptance rate.
+
+    Returns:
+    --------
+    (1) The estimated failure rate at acceptance rate r.
+    '''
+    # Get ID of the most lenient judge.
+    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
+
+    # Subset. "D_q is the set of all observations judged by q."
+    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
+
+    # All observations of R_q have observed outcome labels.
+    # "R_q is the set of observations in D_q with observed outcome labels."
+    R_q = D_q[D_q[decisionT_col] == 1].copy()
+
+    # Sort observations in R_q in descending order of confidence scores S and
+    # assign to R_sort_q.
+    # "Observations deemed as high risk by B are at the top of this list"
+    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
+
+    number_to_remove = int(
+        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
+
+    # "R_B is the list of observations assigned to t = 1 by B"
+    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
+
+    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
+
+
+# ### Evaluators
+
+def trueEvaluationEvaluator(df, resultY_col, r):
+
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+    to_release = int(round(df.shape[0] * r))
+    
+    failed = df[resultY_col][0:to_release] == 0
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
+
+    df_observed = df.loc[df[decisionT_col] == 1, :]
+
+    df_observed = df_observed.sort_values(by='B_prob_0_model',
+                                              inplace=False,
+                                              ascending=True)
+
+    to_release = int(round(df_observed.shape[0] * r))
+    
+    failed = df_observed[resultY_col][0:to_release] == 0
+
+    if adjusted:
+        return np.mean(failed), df.shape[0]
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
+                             accRateR_col, r):
+
+    # Get judges with correct leniency as list
+    is_correct_leniency = df[accRateR_col].round(1) == r
+
+    # No judges with correct leniency
+    if np.sum(is_correct_leniency) == 0:
+        return np.nan, np.nan
+
+    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
+
+    # Released are the people they judged and released, T = 1
+    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
+                  & (df[decisionT_col] == 1)]
+
+    failed = released[resultY_col] == 0
+    
+    # Get their failure rate, aka ratio of reoffenders to number of people judged in total
+    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
+
+###################
+
+# Read in the data
+compas_raw = pd.read_csv(data_path)
+
+# Select columns
+compas = compas_raw[[
+    'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
+    'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
+    'two_year_recid', 'c_jail_in', 'c_jail_out'
+]]
+
+# Subset values, see reasons in ProPublica methodology.
+compas = compas.query('days_b_screening_arrest <= 30 and \
+                      days_b_screening_arrest >= -30 and \
+                      is_recid != -1 and \
+                      c_charge_degree != "O"')
+
+# Drop row if score_text is na
+compas = compas[compas.score_text.notnull()]
+
+# Recode recidivism values to fit earlier notation
+# So here result_Y = 1 - is_recid (inverted binary coding).
+compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1)
+
+# Convert string values to dummies, drop first so full rank
+compas_dummy = pd.get_dummies(
+    compas,
+    columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
+    drop_first=True)
+
+compas_dummy.drop(columns=[
+    'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
+    'two_year_recid', 'score_text', 'is_recid'
+],
+    inplace=True)
+
+# Shuffle rows for random judge assignment
+compas_shuffled = compas_dummy.sample(frac=1)
+
+nJudges_M = 9
+
+# Assign judges as evenly as possible
+judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False)
+
+# Assign fixed leniencies from 0.1 to 0.9
+judge_leniency = np.arange(1, 10) / 10
+
+judge_leniency = judge_leniency[judge_ID]
+
+compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID,
+                                         judge_leniency=judge_leniency)
+
+# Sort by judges then probabilities in decreasing order
+# Most dangerous for each judge are at the top.
+compas_shuffled.sort_values(by=["judge_ID", "decile_score"],
+                            ascending=False,
+                            inplace=True)
+
+# Iterate over the data. Subject will be given a negative decision
+# if they are in the top (1-r)*100% of the individuals the judge will judge.
+# I.e. if their within-judge-index is under 1 - acceptance threshold times
+# the number of subjects assigned to each judge they will receive a
+# negative decision.
+compas_shuffled.reset_index(drop=True, inplace=True)
+
+subjects_allocated = compas_shuffled.judge_ID.value_counts()
+
+compas_shuffled['judge_index'] = compas_shuffled.groupby('judge_ID').cumcount()
+
+compas_shuffled['decision_T'] = np.where(
+    compas_shuffled['judge_index'] < (1 - compas_shuffled['judge_leniency']) *
+    subjects_allocated[compas_shuffled['judge_ID']].values, 0, 1)
+
+compas_labeled = compas_shuffled.copy()
+compas_unlabeled = compas_shuffled.copy()
+
+# Hide unobserved
+compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan
+
+# Choose feature_columns
+feature_cols = ~compas_labeled.columns.isin(
+    ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T'])
+
+feature_cols = compas_labeled.columns[feature_cols]
+
+
+####### Draw figures ###########
+
+failure_rates = np.zeros((8, 6))
+error_Ns = np.zeros((8, 6))
+
+# Split data
+train, test_labeled = train_test_split(compas_labeled, test_size=0.5)
+
+# Assign same observations to unlabeled data
+test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values]
+
+
+# Train a logistic regression model
+B_model, predictions = fitPredictiveModel(
+    train.loc[train['decision_T'] == 1, feature_cols],
+    train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0)
+
+test_labeled = test_labeled.assign(B_prob_0_model=predictions)
+test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
+
+test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
+
+# Find observed values
+observed = test_labeled['decision_T'] == 1
+
+# Assign data to the model
+dat = dict(D=1,
+           N_obs=np.sum(observed),
+           N_cens=np.sum(~observed),
+           K=group_amount,
+           sigma_tau=sigma_tau,
+           M=len(set(compas_labeled['judge_ID'])),
+           jj_obs=test_labeled.loc[observed, 'judge_ID']+1,
+           jj_cens=test_labeled.loc[~observed, 'judge_ID']+1,
+           kk_obs=kk_array[observed]+1,
+           kk_cens=kk_array[~observed]+1,
+           dec_obs=test_labeled.loc[observed, 'decision_T'],
+           dec_cens=test_labeled.loc[~observed, 'decision_T'],
+           X_obs=test_labeled.loc[observed, 'B_prob_0_model'].values.reshape(-1, 1),
+           X_cens=test_labeled.loc[~observed, 'B_prob_0_model'].values.reshape(-1, 1),
+           y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
+
+sm = pystan.StanModel(file=stan_code_file_name)
+
+fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9))
+
+pars = fit.extract()
+
+print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
+
+plt.figure(figsize=(15,30))
+
+fit.plot();
+
+plt.savefig(save_name + '_stan_diagnostic_plot')
+
+plt.show()
+plt.close('all')
+
+# Bayes
+
+# Initialize a matrix with one posterior sample per row
+# and one observation per column
+y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
+
+# Fill unobserved entries with the model estimates
+y_imp[:, ~observed] = 1-pars['y_est']
+
+# Fill observed entries with the observed outcomes
+y_imp[:, observed] = 1-test_labeled.loc[observed, 'result_Y']
+
+Rs = np.arange(.1, .9, .1)
+
+to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
+
+f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan)
+
+for i in range(len(to_release_list)):
+    
+    failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
+    
+    est_failure_rates =  failed / test_labeled.shape[0]
+            
+    failure_rates[i, 5] = np.mean(est_failure_rates)
+    
+    error_Ns[i, 5] = test_labeled.shape[0]
+
+for r in np.arange(1, 9):
+
+    print(".", end="")
+
+    # True evaluation
+
+    FR, N = trueEvaluationEvaluator(test_unlabeled, 'result_Y', r / 10)
+    
+    failure_rates[r - 1, 0] = FR
+    error_Ns[r - 1, 0] = N
+
+    # Labeled outcomes only
+
+    FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10)
+
+    failure_rates[r - 1, 1] = FR
+    error_Ns[r - 1, 1] = N
+    
+    # Adjusted labeled outcomes
+
+    FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10,
+                                     adjusted=True)
+
+    failure_rates[r - 1, 2] = FR
+    error_Ns[r - 1, 2] = N
+
+    # Human evaluation
+
+    FR, N = humanEvaluationEvaluator(test_labeled, 'judge_ID', 'decision_T', 
+                                     'result_Y',  'judge_leniency', 
+                                     r / 10)
+
+    failure_rates[r - 1, 3] = FR
+    error_Ns[r - 1, 3] = N
+
+    # Contraction
+
+    FR, N = contraction(test_labeled, 'judge_ID', 'decision_T', 'result_Y', 
+                        'B_prob_0_model', 'judge_leniency', r / 10)
+
+    failure_rates[r - 1, 4] = FR
+    error_Ns[r - 1, 4] = N
+
+x_ax = np.arange(0.1, 0.9, 0.1)
+
+failure_SEs = standardError(failure_rates, error_Ns)
+
+labels = [
+    'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
+    'Human evaluation', 'Contraction', 'Potential outcomes'
+]
+colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c']
+
+for i in range(failure_rates.shape[1]):
+    plt.errorbar(x_ax,
+                 failure_rates[:, i],
+                 label=labels[i],
+                 c=colours[i],
+                 yerr=failure_SEs[:, i])
+
+plt.title('Failure rate vs. Acceptance rate')
+plt.xlabel('Acceptance rate')
+plt.ylabel('Failure rate')
+plt.legend()
+plt.grid()
+
+plt.savefig(save_name + '_all')
+
+plt.show()
+
+print("\nFailure rates:")
+print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
+
+print("\nMean absolute errors:")
+for i in range(1, failure_rates.shape[1]):
+    print(
+        labels[i].ljust(len(max(labels, key=len))),
+        np.round(
+            np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
\ No newline at end of file
diff --git a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py b/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py
new file mode 100644
index 0000000..4ea26d0
--- /dev/null
+++ b/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py
@@ -0,0 +1,1101 @@
+'''
+# Author: Riku Laine
+# Date: 25JUL2019 (start)
+# Project name: Potential outcomes in model evaluation
+# Description: This script creates the figures and results used 
+#              in synthetic data experiments.
+#
+# Parameters:
+# -----------
+# (1) figure_path : file name for saving the created figures.
+# (2) N_sim : Size of simulated data set.
+# (3) M_sim : Number of judges in simulated data, 
+#             N_sim must be divisible by M_sim!
+# (4) which : Which data + outcome analysis should be performed.
+# (5) group_amount : Number of groups if the Jung-inspired model is used.
+# (6) stan_code_file_name : Name of the file containing the Stan model code.
+# (7) sigma_tau : Value of the prior variance for the Jung-inspired model.
+# (8) model_type : What model type to be fitted. Options:
+#                   - "lr" : logistic regression
+#                   - "rf" : random forest
+#                   - "fully_random" : Fully random, all predictions will be 0.5.
+#
+'''
+# Refer to the `notes.tex` file for explanations about the modular framework.
+
+# Imports
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import scipy.stats as scs
+import scipy.special as ssp
+import scipy.integrate as si
+import numpy.random as npr
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+import pystan
+import gc
+
+plt.switch_backend('agg')
+
+import sys
+
+# figure storage name
+figure_path = sys.argv[1]
+
+# Size of simulated data set
+N_sim = int(sys.argv[2])
+
+# Number of judges in simulated data, N_sim must be divisible by M_sim!
+M_sim = int(sys.argv[3])
+
+# Which data + outcome generation should be performed.
+which = int(sys.argv[4])
+
+# How many groups if jung model is used
+group_amount = int(sys.argv[5])
+
+# Name of stan model code file
+stan_code_file_name = sys.argv[6]
+
+# Variance prior
+sigma_tau = float(sys.argv[7])
+
+# Type of model to be fitted
+model_type = sys.argv[8]
+
+# Settings
+
+plt.rcParams.update({'font.size': 16})
+plt.rcParams.update({'figure.figsize': (10, 6)})
+
+print("These results have been obtained with the following settings:")
+
+print("Number of observations in the simulated data:", N_sim)
+
+print("Number of judges in the simulated data:", M_sim)
+
+print("Number of groups:", group_amount)
+
+print("Prior for the variances:", sigma_tau)
+
+# Basic functions
+
+
+def inverseLogit(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
+
+
+def logit(x):
+    return np.log(x) - np.log(1.0 - x)
+
+
+def inverseCumulative(x, mu, sigma):
+    '''Compute the inverse of the cumulative distribution of logit-normal
+    distribution at x with parameters mu and sigma (mean and st.dev.).'''
+
+    return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
+
+def standardError(p, n):
+    denominator = p * (1 - p)
+    
+    return np.sqrt(denominator / n)
+
+
+# ## Data generation modules
+
+def bernoulliDGWithoutUnobservables(N_total=50000):
+    '''Generates data | Variables: X, Y | Outcome from Bernoulli'''
+
+    df = pd.DataFrame()
+
+    # Sample feature X from standard Gaussian distribution, N(0, 1).
+    df = df.assign(X=npr.normal(size=N_total))
+
+    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
+    df = df.assign(probabilities_Y=inverseLogit(df.X))
+
+    # Draw Y ~ Bernoulli(1 - inverseLogit(X))
+    # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X)
+    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
+
+    df = df.assign(result_Y=results)
+
+    return df
+
+
+def thresholdDGWithUnobservables(N_total=50000,
+                                 beta_X=1.0,
+                                 beta_Z=1.0,
+                                 beta_W=0.2):
+    '''Generates data | Variables: X, Z, W, Y | Outcome by threshold'''
+
+    df = pd.DataFrame()
+
+    # Sample the variables from standard Gaussian distributions.
+    df = df.assign(X=npr.normal(size=N_total))
+    df = df.assign(Z=npr.normal(size=N_total))
+    df = df.assign(W=npr.normal(size=N_total))
+
+    # Calculate P(Y=0|X, Z, W)
+    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
+
+    df = df.assign(probabilities_Y=probabilities_Y)
+
+    # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise
+    df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1))
+
+    return df
+
+
+def bernoulliDGWithUnobservables(N_total=50000,
+                                beta_X=1.0,
+                                beta_Z=1.0,
+                                beta_W=0.2):
+    '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli'''
+    
+    df = pd.DataFrame()
+
+    # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1).
+    df = df.assign(X=npr.normal(size=N_total))
+    df = df.assign(Z=npr.normal(size=N_total))
+    df = df.assign(W=npr.normal(size=N_total))
+
+    # Calculate P(Y=0|X, Z, W) = inverseLogit(beta_X*X + beta_Z*Z + beta_W*W)
+    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
+
+    df = df.assign(probabilities_Y=probabilities_Y)
+
+    # Draw Y from Bernoulli distribution
+    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
+
+    df = df.assign(result_Y=results)
+
+    return df
+
+
+# ## Decider modules
+
+def humanDeciderLakkaraju(df,
+                          featureX_col,
+                          featureZ_col=None,
+                          nJudges_M=100,
+                          beta_X=1,
+                          beta_Z=1,
+                          add_epsilon=True):
+    '''Decider module | Non-independent batch decisions.'''
+
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Compute the number of subjects allocated for each judge.
+    nSubjects_N = int(df.shape[0] / nJudges_M)
+
+    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
+    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
+
+    # Sample acceptance rates uniformly from a closed interval
+    # from 0.1 to 0.9 and round to tenth decimal place.
+    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
+    #            values.
+    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
+    acceptance_rates = np.round(acceptance_rates, 10)
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
+
+    if add_epsilon:
+        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
+    else:
+        epsilon = 0
+    
+    if featureZ_col is None:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
+    else:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
+                                    beta_Z * df[featureZ_col] + epsilon)
+
+
+    df = df.assign(probabilities_T=probabilities_T)
+
+    # Sort by judges then probabilities in decreasing order
+    # Most dangerous for each judge are at the top.
+    df.sort_values(by=["judgeID_J", "probabilities_T"],
+                   ascending=False,
+                   inplace=True)
+
+    # Iterate over the data. Subject will be given a negative decision
+    # if they are in the top (1-r)*100% of the individuals the judge will judge.
+    # I.e. if their within-judge-index is under 1 - acceptance threshold times
+    # the number of subjects assigned to each judge they will receive a
+    # negative decision.
+    df.reset_index(drop=True, inplace=True)
+
+    df['decision_T'] = np.where((df.index.values % nSubjects_N) <
+                                ((1 - df['acceptanceRate_R']) * nSubjects_N),
+                                0, 1)
+
+    df_labeled = df.copy()
+
+    # Hide unobserved
+    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
+
+    return df_labeled, df
+
+
+def bernoulliDecider(df,
+                    featureX_col,
+                    featureZ_col=None,
+                    nJudges_M=100,
+                    beta_X=1,
+                    beta_Z=1,
+                    add_epsilon=True):
+    '''Use X and Z to make a decision with probability 
+    P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).'''
+
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Compute the number of subjects allocated for each judge.
+    nSubjects_N = int(df.shape[0] / nJudges_M)
+
+    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
+    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
+
+    if add_epsilon:
+        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
+    else:
+        epsilon = 0
+    
+    if featureZ_col is None:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
+    else:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
+                                    beta_Z * df[featureZ_col] + epsilon)
+
+    df = df.assign(probabilities_T=probabilities_T)
+
+    # Draw T from Bernoulli distribution
+    decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0])
+
+    df = df.assign(decision_T=decisions)
+
+    # Calculate the acceptance rates.
+    acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
+
+    df_labeled = df.copy()
+
+    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
+
+    return df_labeled, df
+
+
+def quantileDecider(df,
+                    featureX_col,
+                    featureZ_col=None,
+                    nJudges_M=100,
+                    beta_X=1,
+                    beta_Z=1,
+                    add_epsilon=True):
+    '''Assign decisions by the value of inverse cumulative distribution function
+    of the logit-normal distribution at leniency r.'''
+    
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Compute the number of subjects allocated for each judge.
+    nSubjects_N = int(df.shape[0] / nJudges_M)
+
+    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
+    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
+
+    # Sample acceptance rates uniformly from a closed interval
+    # from 0.1 to 0.9 and round to tenth decimal place.
+    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
+    #            values.
+    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
+    acceptance_rates = np.round(acceptance_rates, 10)
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
+
+    if add_epsilon:
+        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
+    else:
+        epsilon = 0
+    
+    if featureZ_col is None:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
+
+        # Compute the bounds straight from the inverse cumulative.
+        # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2.
+        df = df.assign(bounds=inverseCumulative(
+            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2)))
+    else:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
+                                    beta_Z * df[featureZ_col] + epsilon)
+
+        # Compute the bounds straight from the inverse cumulative.
+        # Assuming X and Z are i.i.d standard Gaussians with variance 1.
+        # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z).
+        df = df.assign(bounds=inverseCumulative(
+            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2)))
+
+    df = df.assign(probabilities_T=probabilities_T)
+
+    # Assign negative decision if the predicted probability (probabilities_T) is
+    # over the judge's threshold (bounds).
+    df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1))
+
+    df_labeled = df.copy()
+
+    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
+
+    return df_labeled, df
+
+
+def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
+    '''Doesn't use any information about X and Z to make decisions.
+    
+    If use_acceptance_rates is False (default) then all decisions are positive
+    with probability 0.5. If True, probabilities will be sampled from 
+    U(0.1, 0.9) and rounded to tenth decimal place.'''
+
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Compute the number of subjects allocated for each judge.
+    nSubjects_N = int(df.shape[0] / nJudges_M)
+
+    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
+    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
+
+    if use_acceptance_rates:
+        # Sample acceptance rates uniformly from a closed interval
+        # from 0.1 to 0.9 and round to tenth decimal place.
+        acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10)
+    else:
+        # No real leniency here -> set to 0.5.
+        acceptance_rates = np.ones(nJudges_M) * 0.5
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
+
+    df = df.assign(
+        decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0]))
+
+    df_labeled = df.copy()
+
+    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
+
+    return df_labeled, df
+
+
+def biasDecider(df,
+                featureX_col,
+                featureZ_col=None,
+                nJudges_M=100,
+                beta_X=1,
+                beta_Z=1,
+                add_epsilon=True):
+    '''
+    Biased decider: if X > 1, then X <- X * 0.75, so people with high X
+    get more positive decisions than they should. And if -2 < X < -1, then
+    X <- X + 0.5, so people with X in (-2, -1) get fewer positive decisions
+    than they should.
+    
+    '''
+
+    # If X > 1, then X <- X * 0.75. People with high X get more positive
+    # decisions than they should. (A small illustrative sketch of this
+    # transform follows after the function.)
+    df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] *
+                                     0.75, df[featureX_col]))
+
+    # If -2 < X < -1, then X <- X + 0.5. People with X in (-2, -1) get fewer
+    # positive decisions than they should.
+    df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1),
+                           df.biased_X + 0.5, df.biased_X)
+
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Use quantile decider, but judge by the biased X.
+    df_labeled, df = humanDeciderLakkaraju(df,
+                                     featureX_col='biased_X',
+                                     featureZ_col=featureZ_col,
+                                     nJudges_M=nJudges_M,
+                                     beta_X=beta_X,
+                                     beta_Z=beta_Z,
+                                     add_epsilon=add_epsilon)
+
+    return df_labeled, df
+
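+
+# A minimal illustrative sketch of the bias transform above (hypothetical
+# helper, not called anywhere in the pipeline): X = 2.0 maps to 1.5,
+# X = -1.5 maps to -1.0, and values that are neither above 1 nor in (-2, -1)
+# are left unchanged.
+def _biasTransformExample():
+    X = np.array([2.0, -1.5, 0.0])
+    biased = np.where(X > 1, X * 0.75, X)
+    # Expected result: array([1.5, -1.0, 0.0])
+    return np.where((biased > -2) & (biased < -1), biased + 0.5, biased)
+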
+
+# ## Evaluator modules
+
+# ### Convenience functions
+
+def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
+    '''
+    Fit a predictive model (default logistic regression) with given training 
+    instances and return probabilities for test instances to obtain a given 
+    class label.
+    
+    Arguments:
+    ----------
+    
+    x_train -- x values of training instances
+    y_train -- y values of training instances
+    x_test -- x values of test instances
+    class_value -- the class label for which the probabilities are computed.
+    model_type -- type of model to be fitted.
+    
+    Returns:
+    --------
+    (1) Trained predictive model
+    (2) Probabilities for given test inputs for given class.
+    '''
+
+    if model_type is None or model_type in ["logistic_regression", "lr"]:
+        # Instantiate the model (using the default parameters)
+        logreg = LogisticRegression(solver='lbfgs')
+
+        # Check shape and fit the model.
+        if x_train.ndim == 1:
+            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
+        else:
+            logreg = logreg.fit(x_train, y_train)
+
+        label_probs_logreg = getProbabilityForClass(x_test, logreg,
+                                                    class_value)
+
+        return logreg, label_probs_logreg
+
+    elif model_type in ["random_forest", "rf"]:
+        # Instantiate the model
+        forest = RandomForestClassifier(n_estimators=100, max_depth=3)
+
+        # Check shape and fit the model.
+        if x_train.ndim == 1:
+            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
+        else:
+            forest = forest.fit(x_train, y_train)
+
+        label_probs_forest = getProbabilityForClass(x_test, forest,
+                                                    class_value)
+
+        return forest, label_probs_forest
+
+    elif model_type == "fully_random":
+
+        label_probs = np.ones_like(x_test) / 2
+
+        model_object = lambda x: 0.5
+
+        return model_object, label_probs
+    else:
+        raise ValueError("Invalid model_type!", model_type)
+
+
+def getProbabilityForClass(x, model, class_value):
+    '''
+    Function (wrapper) for obtaining the probability of a class given x and a 
+    predictive model.
+
+    Arguments:
+    -----------
+    x -- individual features, an array of shape (observations, features)
+    model -- a trained sklearn model. Predicts probabilities for given x. 
+        Should accept input of shape (observations, features)
+    class_value -- the resulting class to predict (usually 0 or 1).
+
+    Returns:
+    --------
+    (1) The probabilities of given class label for each x.
+    '''
+    if x.ndim == 1:
+        # if x is vector, transform to column matrix.
+        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
+    else:
+        f_values = model.predict_proba(x)
+
+    # Get correct column of predicted class, remove extra dimensions and return.
+    return f_values[:, model.classes_ == class_value].flatten()
+
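+
+# Hypothetical usage sketch (illustrative only, not part of the analysis
+# pipeline): class-0 probabilities from a logistic regression fitted on a
+# small toy data set; larger X should give a larger probability for class 0.
+def _toyProbabilityExample():
+    toy_X = np.array([[-2.0], [-1.0], [1.0], [2.0]])
+    toy_y = np.array([1, 1, 0, 0])
+    toy_model = LogisticRegression(solver='lbfgs').fit(toy_X, toy_y)
+    return getProbabilityForClass(toy_X, toy_model, class_value=0)
+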
+
+def cdf(x_0, model, class_value):
+    '''
+    Cumulative distribution function of the model's predictions: the probability
+    that the prediction for a standard Gaussian input is at most the prediction
+    for x_0. The integral is approximated using Simpson's rule for efficiency.
+    
+    Arguments:
+    ----------
+    
+    x_0 -- private features of an instance for which the value of cdf is to be
+        calculated.
+    model -- a trained sklearn model. Predicts probabilities for given x. 
+        Should accept input of shape (observations, features)
+    class_value -- the resulting class to predict (usually 0 or 1).
+
+    '''
+
+    def prediction(x):
+        return getProbabilityForClass(
+            np.array([x]).reshape(-1, 1), model, class_value)
+
+    prediction_x_0 = prediction(x_0)
+
+    x_values = np.linspace(-15, 15, 40000)
+
+    x_preds = prediction(x_values)
+
+    y_values = scs.norm.pdf(x_values)
+
+    results = np.zeros(x_0.shape[0])
+
+    for i in range(x_0.shape[0]):
+
+        y_copy = y_values.copy()
+
+        y_copy[x_preds > prediction_x_0[i]] = 0
+
+        results[i] = si.simps(y_copy, x=x_values)
+
+    return results
+
+# ### Contraction algorithm
+# 
+# Below is an implementation of the contraction algorithm presented by
+# Lakkaraju et al. in [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066).
+# The relevant parameters to be passed to the function are described in the
+# docstring; a small worked example of the bookkeeping follows the function.
+
+def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
+                accRateR_col, r):
+    '''
+    This is an implementation of the algorithm presented by Lakkaraju
+    et al. in their paper "The Selective Labels Problem: Evaluating 
+    Algorithmic Predictions in the Presence of Unobservables" (2017).
+
+    Arguments:
+    ----------
+    df -- The (Pandas) data frame containing the data, judge decisions,
+        judge IDs, results and probability scores.
+    judgeIDJ_col -- String, the name of the column containing the judges' IDs
+        in df.
+    decisionT_col -- String, the name of the column containing the judges' decisions.
+    resultY_col -- String, the name of the column containing the realized outcomes.
+    modelProbS_col -- String, the name of the column containing the probability
+        scores from the black-box model B.
+    accRateR_col -- String, the name of the column containing the judges'
+        acceptance rates.
+    r -- Float between 0 and 1, the given acceptance rate.
+
+    Returns:
+    --------
+    (1) The estimated failure rate at acceptance rate r.
+    (2) The number of observations judged by the most lenient judge
+        (used to compute the standard error).
+    '''
+    # Get ID of the most lenient judge.
+    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
+
+    # Subset. "D_q is the set of all observations judged by q."
+    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
+
+    # All observations of R_q have observed outcome labels.
+    # "R_q is the set of observations in D_q with observed outcome labels."
+    R_q = D_q[D_q[decisionT_col] == 1].copy()
+
+    # Sort observations in R_q in descending order of confidence scores S and
+    # assign to R_sort_q.
+    # "Observations deemed as high risk by B are at the top of this list"
+    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
+
+    number_to_remove = int(
+        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
+
+    # "R_B is the list of observations assigned to t = 1 by B"
+    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
+
+    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
+
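+
+# A worked example of the bookkeeping above (illustrative numbers only;
+# _contractionCountsExample is a hypothetical helper, never called): if the
+# most lenient judge q saw |D_q| = 100 cases and released |R_q| = 90 of them,
+# then at a target acceptance rate r = 0.7 the algorithm removes
+# round((1 - r) * 100 - (100 - 90)) = 20 of the riskiest released cases and
+# the remaining 70 form R_B.
+def _contractionCountsExample(n_Dq=100, n_Rq=90, r=0.7):
+    number_to_remove = int(round((1.0 - r) * n_Dq - (n_Dq - n_Rq)))
+    return number_to_remove, n_Rq - number_to_remove  # (20, 70)
+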
+
+# ### Evaluators
+
+def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r):
+
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+    to_release = int(round(df.shape[0] * r))
+    
+    failed = df[resultY_col][0:to_release] == 0
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+def labeledOutcomesEvaluator(df,
+                             featureX_col,
+                             decisionT_col,
+                             resultY_col,
+                             r,
+                             adjusted=False):
+
+    df_observed = df.loc[df[decisionT_col] == 1, :]
+
+    df_observed = df_observed.sort_values(by='B_prob_0_model',
+                                              inplace=False,
+                                              ascending=True)
+
+    to_release = int(round(df_observed.shape[0] * r))
+    
+    failed = df_observed[resultY_col][0:to_release] == 0
+
+    if adjusted:
+        return np.mean(failed), df.shape[0]
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
+                             accRateR_col, r):
+
+    # Get judges with correct leniency as list
+    is_correct_leniency = df[accRateR_col].round(1) == r
+
+    # No judges with correct leniency
+    if np.sum(is_correct_leniency) == 0:
+        return np.nan, np.nan
+
+    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
+
+    # Released are the people they judged and released, T = 1
+    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
+                  & (df[decisionT_col] == 1)]
+
+    failed = released[resultY_col] == 0
+    
+    # Failure rate: the number of failures (Y = 0) divided by the total number
+    # of subjects judged by these judges.
+    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
+
+
+def monteCarloEvaluator(df,
+                        featureX_col,
+                        decisionT_col,
+                        resultY_col,
+                        accRateR_col,
+                        r,
+                        mu_X=0,
+                        mu_Z=0,
+                        beta_X=1,
+                        beta_Z=1,
+                        sigma_X=1,
+                        sigma_Z=1):
+
+    # Compute the predicted/assumed decision bounds for all the judges.
+    q_r = inverseCumulative(x=df[accRateR_col],
+                             mu=mu_X + mu_Z,
+                             sigma=np.sqrt((beta_X * sigma_X)**2 +
+                                           (beta_Z * sigma_Z)**2))
+
+    df = df.assign(bounds=logit(q_r) - df[featureX_col])
+
+    # Compute the expectation of Z when it is known to come from truncated
+    # Gaussian.
+    alphabeta = (df.bounds - mu_Z) / (sigma_Z)
+
+    Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z)  # 1 - cdf(ab)
+
+    # E(Z | Z > a). Expectation of Z if negative decision.
+    exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_
+
+    # E(Z | Z < b). Expectation of Z if positive decision.
+    exp_upper_trunc = mu_Z - (
+        sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta)
+
+    exp_Z = (1 - df[decisionT_col]
+             ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc
+
+    # Attach the predicted probability for Y=0 to data.
+    df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z))
+
+    # Predictions drawn from binomial.
+    predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0])
+
+    df[resultY_col] = np.where(df[decisionT_col] == 0, predictions,
+                                 df[resultY_col])
+
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+    to_release = int(round(df.shape[0] * r))
+    
+    failed = df[resultY_col][0:to_release] == 0
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
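+
+# Illustrative check of the truncated-Gaussian identities used above
+# (hypothetical helper, never called): for a standard normal Z,
+# E(Z | Z > a) = pdf(a) / (1 - cdf(a)), which can be compared against a
+# numerical integral over a fine grid.
+def _truncatedNormalExpectationExample(a=0.5):
+    grid = np.linspace(a, 15, 200001)
+    numeric = si.simps(grid * scs.norm.pdf(grid), x=grid) / scs.norm.sf(a)
+    analytic = scs.norm.pdf(a) / scs.norm.sf(a)
+    # The two values should agree to several decimal places.
+    return numeric, analytic
+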
+
+
+def perfComp(dgModule, deciderModule, title, save_name):
+    failure_rates = np.zeros((8, 7))
+    error_Ns = np.zeros((8, 7))
+
+    # Create data
+    df = dgModule()
+
+    # Decisions
+    df_labeled, df_unlabeled = deciderModule(df)
+
+    # Split data
+    train, test_labeled = train_test_split(df_labeled, test_size=0.5)
+    
+    # Assign the same observations to the unlabeled data
+    test_unlabeled = df_unlabeled.iloc[test_labeled.index.values]
+
+    # Train model
+    B_model, predictions = fitPredictiveModel(
+        train.loc[train['decision_T'] == 1, 'X'],
+        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], 0, model_type=model_type)
+
+    # Attach predictions to data
+    test_labeled = test_labeled.assign(B_prob_0_model=predictions)
+    test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
+
+    test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
+
+    # Find observed values
+    observed = test_labeled['decision_T'] == 1
+
+    # Assign data to the model
+    dat = dict(D=1,
+               N_obs=np.sum(observed),
+               N_cens=np.sum(~observed),
+               K=group_amount,
+               sigma_tau=sigma_tau,
+               M=len(set(df_unlabeled['judgeID_J'])),
+               jj_obs=test_labeled.loc[observed, 'judgeID_J']+1,
+               jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1,
+               kk_obs=kk_array[observed]+1,
+               kk_cens=kk_array[~observed]+1,
+               dec_obs=test_labeled.loc[observed, 'decision_T'],
+               dec_cens=test_labeled.loc[~observed, 'decision_T'],
+               X_obs=test_labeled.loc[observed, 'X'].values.reshape(-1,1),
+               X_cens=test_labeled.loc[~observed, 'X'].values.reshape(-1,1),
+               y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
+
+    fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9))
+
+    pars = fit.extract()
+
+    plt.figure(figsize=(15,30))
+
+    fit.plot();
+
+    plt.savefig(save_name + '_stan_diagnostic_plot')
+    
+    plt.show()
+    plt.close('all')
+
+    print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
+
+    # Bayes
+    
+    # Initialize a matrix with one posterior sample per row and one
+    # observation (subject) per column.
+    y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
+    
+    # Fill in the unobserved outcomes with the estimated values.
+    y_imp[:, ~observed] = 1 - pars['y_est']
+    
+    # Fill in the observed outcomes with the observed values.
+    y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
+
+    Rs = np.arange(.1, .9, .1)
+    
+    to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
+        
+    for i in range(len(to_release_list)):
+        
+        failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
+        
+        est_failure_rates =  failed / test_labeled.shape[0]
+                
+        failure_rates[i, 6] = np.mean(est_failure_rates)
+        
+        error_Ns[i, 6] = test_labeled.shape[0]
+                    
+    for r in range(1, 9):
+
+        print(".", end="")
+
+        # True evaluation
+
+        FR, N = trueEvaluationEvaluator(test_unlabeled, 'X', 'decision_T', 
+                                        'result_Y', r / 10)
+        
+        failure_rates[r - 1, 0] = FR
+        error_Ns[r - 1, 0] = N
+
+        # Labeled outcomes only
+
+        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T', 
+                                         'result_Y', r / 10)
+
+        failure_rates[r - 1, 1] = FR
+        error_Ns[r - 1, 1] = N
+
+        # Adjusted labeled outcomes
+
+        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T',
+                                         'result_Y', r / 10, adjusted=True)
+
+        failure_rates[r - 1, 2] = FR
+        error_Ns[r - 1, 2] = N
+
+        # Human evaluation
+
+        FR, N = humanEvaluationEvaluator(test_labeled, 'judgeID_J', 
+                                         'decision_T', 'result_Y', 
+                                         'acceptanceRate_R', r / 10)
+
+        failure_rates[r - 1, 3] = FR
+        error_Ns[r - 1, 3] = N
+
+        # Contraction
+
+        FR, N = contraction(test_labeled, 'judgeID_J', 'decision_T',  
+                            'result_Y', 'B_prob_0_model', 'acceptanceRate_R', r / 10)
+
+        failure_rates[r - 1, 4] = FR
+        error_Ns[r - 1, 4] = N
+
+        # Causal model - analytic solution
+
+        FR, N = monteCarloEvaluator(test_labeled, 'X', 'decision_T', 
+                                    'result_Y', 'acceptanceRate_R', r / 10)
+
+        failure_rates[r - 1, 5] = FR
+        error_Ns[r - 1, 5] = N
+        
+    failure_SEs = standardError(failure_rates, error_Ns)
+
+    x_ax = np.arange(0.1, 0.9, 0.1)
+
+    labels = [
+        'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
+        'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes'
+    ]
+    colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c']
+
+    for i in range(failure_rates.shape[1]):
+        plt.errorbar(x_ax,
+                     failure_rates[:, i],
+                     label=labels[i],
+                     c=colours[i],
+                     yerr=failure_SEs[:, i])
+
+    plt.title('Failure rate vs. Acceptance rate')
+    plt.xlabel('Acceptance rate')
+    plt.ylabel('Failure rate')
+    plt.legend()
+    plt.grid()
+    
+    plt.savefig(save_name + '_all')
+    
+    plt.show()
+
+    print("\nFailure rates:")
+    print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
+    
+    print("\nMean absolute errors:")
+    for i in range(1, failure_rates.shape[1]):
+        print(
+            labels[i].ljust(len(max(labels, key=len))),
+            np.round(
+                np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
+
+
+sm = pystan.StanModel(file=stan_code_file_name)
+
+if which == 1:
+    print("\nWithout unobservables (Bernoulli + independent decisions)")
+
+    dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim)
+
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z"
+    )
+
+gc.collect()
+plt.close('all')
+
+print("\nWith unobservables in the data")
+
+if which == 2:
+    print("\nBernoulli + independent decisions")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations \n" +
+        "Bernoulli + independent decisions, with unobservables",
+        figure_path + "sl_bernoulli_independent_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 3:
+    print("\nThreshold rule + independent decisions")
+
+    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations \n" +
+        "Threshold rule + independent decisions, with unobservables",
+        figure_path + "sl_threshold_independent_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 4:
+    print("\nBernoulli + non-independent (batch) decisions")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: humanDeciderLakkaraju(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations \n" +
+        "Bernoulli + non-independent decisions, with unobservables",
+        figure_path + "sl_bernoulli_batch_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 5:
+    print("\nThreshold rule + non-independent (batch) decisions")
+
+    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: humanDeciderLakkaraju(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations \n" +
+        "Threshold rule + non-independent decisions, with unobservables",
+        figure_path + "sl_threshold_batch_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 6:
+    print("\nRandom decider")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: randomDecider(
+        x, nJudges_M=M_sim, use_acceptance_rates=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Bernoulli + random decider with leniency and unobservables",
+        figure_path + "sl_random_decider_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 7:
+    print("\nBiased decider")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Bernoulli + biased decider with leniency and unobservables",
+        figure_path + "sl_biased_decider_with_Z",
+    )
+
+
+if which == 8:
+    print("\nBad judge")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Bernoulli + 'bad' decider with leniency and unobservables",
+        figure_path + "sl_bad_decider_with_Z"
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 9:
+    print("\nBernoulli + Bernoulli")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Bernoulli + Bernoulli",
+        figure_path + "sl_bernoulli_bernoulli_with_Z",
+    )
+    
+if which == 10:
+    print("\nBeta_Z = 3, Threshold + batch")
+
+    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0)
+
+    decider = lambda x: humanDeciderLakkaraju(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Beta_Z = 3, threshold + batch",
+        figure_path + "sl_threshold_batch_beta_Z_3_with_Z",
+    )
+    
+if which == 11:
+    print("\nBeta_Z = 5, Threshold + batch")
+
+    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0)
+
+    decider = lambda x: humanDeciderLakkaraju(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Beta_Z = 5, threshold + batch",
+        figure_path + "sl_threshold_batch_beta_Z_5_with_Z",
+    )
\ No newline at end of file
diff --git a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py b/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py
new file mode 100644
index 0000000..00711de
--- /dev/null
+++ b/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py
@@ -0,0 +1,1101 @@
+'''
+# Author: Riku Laine
+# Date: 25JUL2019 (start)
+# Project name: Potential outcomes in model evaluation
+# Description: This script creates the figures and results used 
+#              in synthetic data experiments.
+#
+# Parameters:
+# -----------
+# (1) figure_path : path prefix for saving the created figures.
+# (2) N_sim : Size of simulated data set.
+# (3) M_sim : Number of judges in simulated data, 
+#             N_sim must be divisible by M_sim!
+# (4) which : Which data + outcome analysis should be performed.
+# (5) group_amount : How many groups if Jung-inspired model is used.
+# (6) stan_code_file_name : Name of file containing the stan model code.
+# (7) sigma_tau : Value of the prior variance for the Jung-inspired model.
+# (8) model_type : What model type to be fitted. Options:
+#                   - "lr" : logistic regression
+#                   - "rf" : random forest
+#                   - "fully_random" : Fully random, all predictions will be 0.5.
+#
+'''
+# Refer to the `notes.tex` file for explanations about the modular framework.
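+
+# Hypothetical invocation (all argument values and the Stan file below are
+# illustrative placeholders, shown only to document the expected argument
+# order):
+#   python stan_modelling_theoretic_SEfixed_useprediction.py figures/sl 50000 100 2 10 <stan_model_file> 1.0 lr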
+
+# Imports
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import scipy.stats as scs
+import scipy.special as ssp
+import scipy.integrate as si
+import numpy.random as npr
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+import pystan
+import gc
+
+plt.switch_backend('agg')
+
+import sys
+
+# figure storage name
+figure_path = sys.argv[1]
+
+# Size of simulated data set
+N_sim = int(sys.argv[2])
+
+# Number of judges in simulated data, N_sim must be divisible by M_sim!
+M_sim = int(sys.argv[3])
+
+# Which data + outcome generation should be performed.
+which = int(sys.argv[4])
+
+# How many groups if jung model is used
+group_amount = int(sys.argv[5])
+
+# Name of stan model code file
+stan_code_file_name = sys.argv[6]
+
+# Variance prior
+sigma_tau = float(sys.argv[7])
+
+# Type of model to be fitted
+model_type = sys.argv[8]
+
+# Settings
+
+plt.rcParams.update({'font.size': 16})
+plt.rcParams.update({'figure.figsize': (10, 6)})
+
+print("These results have been obtained with the following settings:")
+
+print("Number of observations in the simulated data:", N_sim)
+
+print("Number of judges in the simulated data:", M_sim)
+
+print("Number of groups:", group_amount)
+
+print("Prior for the variances:", sigma_tau)
+
+# Basic functions
+
+
+def inverseLogit(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
+
+
+def logit(x):
+    return np.log(x) - np.log(1.0 - x)
+
+
+def inverseCumulative(x, mu, sigma):
+    '''Compute the inverse of the cumulative distribution function of the
+    logit-normal distribution at x with parameters mu and sigma (mean and
+    st.dev. of the underlying Gaussian).'''
+
+    # Quantile of the logit-normal: inverseLogit(mu + sigma * Phi^-1(x)),
+    # where Phi^-1(x) = sqrt(2) * erfinv(2x - 1). (All calls in this script
+    # use mu = 0, so the sign of the mu term does not affect the results.)
+    return inverseLogit(mu + ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2))
+
+def standardError(p, n):
+    '''Standard error of a proportion p estimated from n observations,
+    i.e. sqrt(p * (1 - p) / n).'''
+    denominator = p * (1 - p)
+    
+    return np.sqrt(denominator / n)
+
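+
+# Quick numeric sanity check (illustrative; _standardErrorExample is a
+# hypothetical helper, never called): with p = 0.2 and n = 1000 the standard
+# error is sqrt(0.2 * 0.8 / 1000), i.e. approximately 0.0126.
+def _standardErrorExample():
+    return standardError(np.array([0.2]), np.array([1000]))
+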
+
+# ## Data generation modules
+
+def bernoulliDGWithoutUnobservables(N_total=50000):
+    '''Generates data | Variables: X, Y | Outcome from Bernoulli'''
+
+    df = pd.DataFrame()
+
+    # Sample feature X from standard Gaussian distribution, N(0, 1).
+    df = df.assign(X=npr.normal(size=N_total))
+
+    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
+    df = df.assign(probabilities_Y=inverseLogit(df.X))
+
+    # Draw Y ~ Bernoulli(1 - inverseLogit(X))
+    # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X)
+    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
+
+    df = df.assign(result_Y=results)
+
+    return df
+
+
+def thresholdDGWithUnobservables(N_total=50000,
+                                 beta_X=1.0,
+                                 beta_Z=1.0,
+                                 beta_W=0.2):
+    '''Generates data | Variables: X, Z, W, Y | Outcome by threshold'''
+
+    df = pd.DataFrame()
+
+    # Sample the variables from standard Gaussian distributions.
+    df = df.assign(X=npr.normal(size=N_total))
+    df = df.assign(Z=npr.normal(size=N_total))
+    df = df.assign(W=npr.normal(size=N_total))
+
+    # Calculate P(Y=0|X, Z, W)
+    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
+
+    df = df.assign(probabilities_Y=probabilities_Y)
+
+    # Result is 0 if P(Y=0 | X, Z, W) >= 0.5, and 1 otherwise.
+    df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1))
+
+    return df
+
+
+def bernoulliDGWithUnobservables(N_total=50000,
+                                beta_X=1.0,
+                                beta_Z=1.0,
+                                beta_W=0.2):
+    '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli'''
+    
+    df = pd.DataFrame()
+
+    # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1).
+    df = df.assign(X=npr.normal(size=N_total))
+    df = df.assign(Z=npr.normal(size=N_total))
+    df = df.assign(W=npr.normal(size=N_total))
+
+    # Calculate P(Y=0|X, Z, W) = inverseLogit(beta_X*X + beta_Z*Z + beta_W*W)
+    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
+
+    df = df.assign(probabilities_Y=probabilities_Y)
+
+    # Draw Y from Bernoulli distribution
+    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
+
+    df = df.assign(result_Y=results)
+
+    return df
+
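+
+# Small illustrative check (hypothetical helper, never called): because the
+# linear predictor beta_X*X + beta_Z*Z + beta_W*W is symmetric around zero,
+# the marginal failure rate P(Y = 0) in the generated data is close to 0.5.
+def _marginalFailureRateExample():
+    toy_df = bernoulliDGWithUnobservables(N_total=10000)
+    return np.mean(toy_df.result_Y == 0)
+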
+
+# ## Decider modules
+
+def humanDeciderLakkaraju(df,
+                          featureX_col,
+                          featureZ_col=None,
+                          nJudges_M=100,
+                          beta_X=1,
+                          beta_Z=1,
+                          add_epsilon=True):
+    '''Decider module | Non-independent batch decisions.'''
+
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Compute the number of subjects allocated for each judge.
+    nSubjects_N = int(df.shape[0] / nJudges_M)
+
+    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
+    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
+
+    # Sample acceptance rates uniformly from a closed interval
+    # from 0.1 to 0.9 and round to tenth decimal place.
+    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
+    #            values.
+    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
+    acceptance_rates = np.round(acceptance_rates, 10)
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
+
+    if add_epsilon:
+        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
+    else:
+        epsilon = 0
+    
+    if featureZ_col is None:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
+    else:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
+                                    beta_Z * df[featureZ_col] + epsilon)
+
+
+    df = df.assign(probabilities_T=probabilities_T)
+
+    # Sort by judges then probabilities in decreasing order
+    # Most dangerous for each judge are at the top.
+    df.sort_values(by=["judgeID_J", "probabilities_T"],
+                   ascending=False,
+                   inplace=True)
+
+    # Assign decisions: a subject receives a negative decision if they are
+    # among the top (1 - r) * 100% riskiest individuals assigned to their
+    # judge, i.e. if their within-judge index is less than
+    # (1 - acceptance rate) * (number of subjects per judge).
+    # (See the worked example after this function.)
+    df.reset_index(drop=True, inplace=True)
+
+    df['decision_T'] = np.where((df.index.values % nSubjects_N) <
+                                ((1 - df['acceptanceRate_R']) * nSubjects_N),
+                                0, 1)
+
+    df_labeled = df.copy()
+
+    # Hide unobserved
+    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
+
+    return df_labeled, df
+
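+
+# Worked example of the batch decision rule above (illustrative numbers only;
+# _batchDecisionExample is a hypothetical helper, never called): with 10
+# subjects per judge and acceptance rate r = 0.5, the 5 riskiest subjects
+# (within-judge indices 0-4) receive a negative decision and the rest a
+# positive one.
+def _batchDecisionExample(nSubjects_N=10, r=0.5):
+    idx = np.arange(nSubjects_N)
+    # Expected result: array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
+    return np.where(idx % nSubjects_N < (1 - r) * nSubjects_N, 0, 1)
+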
+
+def bernoulliDecider(df,
+                    featureX_col,
+                    featureZ_col=None,
+                    nJudges_M=100,
+                    beta_X=1,
+                    beta_Z=1,
+                    add_epsilon=True):
+    '''Use X and Z to make a decision with probability 
+    P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).'''
+
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Compute the number of subjects allocated for each judge.
+    nSubjects_N = int(df.shape[0] / nJudges_M)
+
+    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
+    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
+
+    if add_epsilon:
+        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
+    else:
+        epsilon = 0
+    
+    if featureZ_col is None:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
+    else:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
+                                    beta_Z * df[featureZ_col] + epsilon)
+
+    df = df.assign(probabilities_T=probabilities_T)
+
+    # Draw T from Bernoulli distribution
+    decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0])
+
+    df = df.assign(decision_T=decisions)
+
+    # Calculate the acceptance rates.
+    acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
+
+    df_labeled = df.copy()
+
+    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
+
+    return df_labeled, df
+
+
+def quantileDecider(df,
+                    featureX_col,
+                    featureZ_col=None,
+                    nJudges_M=100,
+                    beta_X=1,
+                    beta_Z=1,
+                    add_epsilon=True):
+    '''Assign decisions by the value of the inverse cumulative distribution
+    function of the logit-normal distribution at leniency r.'''
+    
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Compute the number of subjects allocated for each judge.
+    nSubjects_N = int(df.shape[0] / nJudges_M)
+
+    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
+    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
+
+    # Sample acceptance rates uniformly from a closed interval
+    # from 0.1 to 0.9 and round to tenth decimal place.
+    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
+    #            values.
+    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
+    acceptance_rates = np.round(acceptance_rates, 10)
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
+
+    if add_epsilon:
+        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
+    else:
+        epsilon = 0
+    
+    if featureZ_col is None:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
+
+        # Compute the bounds straight from the inverse cumulative.
+        # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2.
+        df = df.assign(bounds=inverseCumulative(
+            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2)))
+    else:
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
+                                    beta_Z * df[featureZ_col] + epsilon)
+
+        # Compute the bounds straight from the inverse cumulative.
+        # Assuming X and Z are independent standard Gaussians,
+        # Var(bX*X + bZ*Z) = bX**2*Var(X) + bZ**2*Var(Z) = bX**2 + bZ**2.
+        # (See the illustrative check after this function.)
+        df = df.assign(bounds=inverseCumulative(
+            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2)))
+
+    df = df.assign(probabilities_T=probabilities_T)
+
+    # Assign a negative decision if the predicted probability (probabilities_T)
+    # is at or above the judge's threshold (bounds).
+    df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1))
+
+    df_labeled = df.copy()
+
+    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
+
+    return df_labeled, df
+
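+
+# Illustrative check of the threshold computation above (hypothetical helper,
+# never called): with beta_X = beta_Z = 1 a judge with leniency r gets the
+# bound inverseCumulative(r, mu=0, sigma=sqrt(2)); the bound is exactly 0.5
+# at r = 0.5 and increases towards 1 for more lenient judges.
+def _quantileBoundExample():
+    r = np.array([0.1, 0.5, 0.9])
+    # Returns values increasing in r, with the middle value equal to 0.5.
+    return inverseCumulative(x=r, mu=0, sigma=np.sqrt(2.0))
+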
+
+def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
+    '''Doesn't use any information about X and Z to make decisions.
+    
+    If use_acceptance_rates is False (default) then all decisions are positive
+    with probability 0.5. If True, probabilities will be sampled from 
+    U(0.1, 0.9) and rounded to tenth decimal place.'''
+
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Compute the number of subjects allocated for each judge.
+    nSubjects_N = int(df.shape[0] / nJudges_M)
+
+    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
+    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
+
+    if use_acceptance_rates:
+        # Sample acceptance rates uniformly from a closed interval
+        # from 0.1 to 0.9 and round to tenth decimal place.
+        acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10)
+    else:
+        # No real leniency here -> set to 0.5.
+        acceptance_rates = np.ones(nJudges_M) * 0.5
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
+
+    df = df.assign(
+        decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0]))
+
+    df_labeled = df.copy()
+
+    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
+
+    return df_labeled, df
+
+
+def biasDecider(df,
+                featureX_col,
+                featureZ_col=None,
+                nJudges_M=100,
+                beta_X=1,
+                beta_Z=1,
+                add_epsilon=True):
+    '''
+    Biased decider: if X > 1, then X <- X * 0.75, so people with high X
+    get more positive decisions than they should. And if -2 < X < -1, then
+    X <- X + 0.5, so people with X in (-2, -1) get fewer positive decisions
+    than they should.
+    
+    '''
+
+    # If X > 1, then X <- X * 0.75. People with high X get more positive
+    # decisions than they should.
+    df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] *
+                                     0.75, df[featureX_col]))
+
+    # If -2 < X < -1, then X <- X + 0.5. People with X in (-2, -1) get fewer
+    # positive decisions than they should.
+    df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1),
+                           df.biased_X + 0.5, df.biased_X)
+
+    # Assert that every judge will have the same number of subjects.
+    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
+
+    # Use quantile decider, but judge by the biased X.
+    df_labeled, df = humanDeciderLakkaraju(df,
+                                     featureX_col='biased_X',
+                                     featureZ_col=featureZ_col,
+                                     nJudges_M=nJudges_M,
+                                     beta_X=beta_X,
+                                     beta_Z=beta_Z,
+                                     add_epsilon=add_epsilon)
+
+    return df_labeled, df
+
+
+# ## Evaluator modules
+
+# ### Convenience functions
+
+def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
+    '''
+    Fit a predictive model (default logistic regression) with given training 
+    instances and return probabilities for test instances to obtain a given 
+    class label.
+    
+    Arguments:
+    ----------
+    
+    x_train -- x values of training instances
+    y_train -- y values of training instances
+    x_test -- x values of test instances
+    class_value -- the class label for which the probabilities are computed.
+    model_type -- type of model to be fitted.
+    
+    Returns:
+    --------
+    (1) Trained predictive model
+    (2) Probabilities for given test inputs for given class.
+    '''
+
+    if model_type is None or model_type in ["logistic_regression", "lr"]:
+        # Instantiate the model (using the default parameters)
+        logreg = LogisticRegression(solver='lbfgs')
+
+        # Check shape and fit the model.
+        if x_train.ndim == 1:
+            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
+        else:
+            logreg = logreg.fit(x_train, y_train)
+
+        label_probs_logreg = getProbabilityForClass(x_test, logreg,
+                                                    class_value)
+
+        return logreg, label_probs_logreg
+
+    elif model_type in ["random_forest", "rf"]:
+        # Instantiate the model
+        forest = RandomForestClassifier(n_estimators=100, max_depth=3)
+
+        # Check shape and fit the model.
+        if x_train.ndim == 1:
+            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
+        else:
+            forest = forest.fit(x_train, y_train)
+
+        label_probs_forest = getProbabilityForClass(x_test, forest,
+                                                    class_value)
+
+        return forest, label_probs_forest
+
+    elif model_type == "fully_random":
+
+        label_probs = np.ones_like(x_test) / 2
+
+        model_object = lambda x: 0.5
+
+        return model_object, label_probs
+    else:
+        raise ValueError("Invalid model_type!", model_type)
+
+
+def getProbabilityForClass(x, model, class_value):
+    '''
+    Function (wrapper) for obtaining the probability of a class given x and a 
+    predictive model.
+
+    Arguments:
+    -----------
+    x -- individual features, an array of shape (observations, features)
+    model -- a trained sklearn model. Predicts probabilities for given x. 
+        Should accept input of shape (observations, features)
+    class_value -- the resulting class to predict (usually 0 or 1).
+
+    Returns:
+    --------
+    (1) The probabilities of given class label for each x.
+    '''
+    if x.ndim == 1:
+        # if x is vector, transform to column matrix.
+        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
+    else:
+        f_values = model.predict_proba(x)
+
+    # Get correct column of predicted class, remove extra dimensions and return.
+    return f_values[:, model.classes_ == class_value].flatten()
+
+
+def cdf(x_0, model, class_value):
+    '''
+    Cumulative distribution function of the model's predictions: the probability
+    that the prediction for a standard Gaussian input is at most the prediction
+    for x_0. The integral is approximated using Simpson's rule for efficiency.
+    
+    Arguments:
+    ----------
+    
+    x_0 -- private features of an instance for which the value of cdf is to be
+        calculated.
+    model -- a trained sklearn model. Predicts probabilities for given x. 
+        Should accept input of shape (observations, features)
+    class_value -- the resulting class to predict (usually 0 or 1).
+
+    '''
+
+    def prediction(x):
+        return getProbabilityForClass(
+            np.array([x]).reshape(-1, 1), model, class_value)
+
+    prediction_x_0 = prediction(x_0)
+
+    x_values = np.linspace(-15, 15, 40000)
+
+    x_preds = prediction(x_values)
+
+    y_values = scs.norm.pdf(x_values)
+
+    results = np.zeros(x_0.shape[0])
+
+    for i in range(x_0.shape[0]):
+
+        y_copy = y_values.copy()
+
+        y_copy[x_preds > prediction_x_0[i]] = 0
+
+        results[i] = si.simps(y_copy, x=x_values)
+
+    return results
+
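+
+# Hypothetical usage sketch of cdf() (illustrative only, never called): for a
+# monotone toy model and x_0 = 0 the value is roughly Phi(0) = 0.5, i.e. the
+# probability that a standard Gaussian input gets a prediction at most that
+# of x_0.
+def _cdfExample():
+    toy_X = np.array([[-2.0], [-1.0], [1.0], [2.0]])
+    toy_y = np.array([1, 1, 0, 0])
+    toy_model = LogisticRegression(solver='lbfgs').fit(toy_X, toy_y)
+    return cdf(np.array([0.0]), toy_model, class_value=0)
+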
+# ### Contraction algorithm
+# 
+# Below is an implementation of the contraction algorithm presented by
+# Lakkaraju et al. in [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066).
+# The relevant parameters to be passed to the function are described in the docstring.
+
+def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
+                accRateR_col, r):
+    '''
+    This is an implementation of the algorithm presented by Lakkaraju
+    et al. in their paper "The Selective Labels Problem: Evaluating 
+    Algorithmic Predictions in the Presence of Unobservables" (2017).
+
+    Arguments:
+    ----------
+    df -- The (Pandas) data frame containing the data, judge decisions,
+        judge IDs, results and probability scores.
+    judgeIDJ_col -- String, the name of the column containing the judges' IDs
+        in df.
+    decisionT_col -- String, the name of the column containing the judges' decisions.
+    resultY_col -- String, the name of the column containing the realized outcomes.
+    modelProbS_col -- String, the name of the column containing the probability
+        scores from the black-box model B.
+    accRateR_col -- String, the name of the column containing the judges'
+        acceptance rates.
+    r -- Float between 0 and 1, the given acceptance rate.
+
+    Returns:
+    --------
+    (1) The estimated failure rate at acceptance rate r.
+    (2) The number of observations judged by the most lenient judge
+        (used to compute the standard error).
+    '''
+    # Get ID of the most lenient judge.
+    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
+
+    # Subset. "D_q is the set of all observations judged by q."
+    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
+
+    # All observations of R_q have observed outcome labels.
+    # "R_q is the set of observations in D_q with observed outcome labels."
+    R_q = D_q[D_q[decisionT_col] == 1].copy()
+
+    # Sort observations in R_q in descending order of confidence scores S and
+    # assign to R_sort_q.
+    # "Observations deemed as high risk by B are at the top of this list"
+    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
+
+    number_to_remove = int(
+        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
+
+    # "R_B is the list of observations assigned to t = 1 by B"
+    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
+
+    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
+
+
+# ### Evaluators
+
+def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r):
+
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+    to_release = int(round(df.shape[0] * r))
+    
+    failed = df[resultY_col][0:to_release] == 0
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+def labeledOutcomesEvaluator(df,
+                             featureX_col,
+                             decisionT_col,
+                             resultY_col,
+                             r,
+                             adjusted=False):
+
+    df_observed = df.loc[df[decisionT_col] == 1, :]
+
+    df_observed = df_observed.sort_values(by='B_prob_0_model',
+                                              inplace=False,
+                                              ascending=True)
+
+    to_release = int(round(df_observed.shape[0] * r))
+    
+    failed = df_observed[resultY_col][0:to_release] == 0
+
+    if adjusted:
+        return np.mean(failed), df.shape[0]
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
+                             accRateR_col, r):
+
+    # Get judges with correct leniency as list
+    is_correct_leniency = df[accRateR_col].round(1) == r
+
+    # No judges with correct leniency
+    if np.sum(is_correct_leniency) == 0:
+        return np.nan, np.nan
+
+    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
+
+    # Released are the people they judged and released, T = 1
+    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
+                  & (df[decisionT_col] == 1)]
+
+    failed = released[resultY_col] == 0
+    
+    # Failure rate: the number of failures (Y = 0) divided by the total number
+    # of subjects judged by these judges.
+    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
+
+
+def monteCarloEvaluator(df,
+                        featureX_col,
+                        decisionT_col,
+                        resultY_col,
+                        accRateR_col,
+                        r,
+                        mu_X=0,
+                        mu_Z=0,
+                        beta_X=1,
+                        beta_Z=1,
+                        sigma_X=1,
+                        sigma_Z=1):
+
+    # Compute the predicted/assumed decision bounds for all the judges.
+    q_r = inverseCumulative(x=df[accRateR_col],
+                             mu=mu_X + mu_Z,
+                             sigma=np.sqrt((beta_X * sigma_X)**2 +
+                                           (beta_Z * sigma_Z)**2))
+
+    df = df.assign(bounds=logit(q_r) - df[featureX_col])
+
+    # Compute the expectation of Z when it is known to come from truncated
+    # Gaussian.
+    alphabeta = (df.bounds - mu_Z) / (sigma_Z)
+
+    Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z)  # 1 - cdf(ab)
+
+    # E(Z | Z > a). Expectation of Z if negative decision.
+    exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_
+
+    # E(Z | Z < b). Expectation of Z if positive decision.
+    exp_upper_trunc = mu_Z - (
+        sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta)
+
+    exp_Z = (1 - df[decisionT_col]
+             ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc
+
+    # Attach the predicted probability for Y=0 to data.
+    df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z))
+
+    # Predictions drawn from binomial.
+    predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0])
+
+    df[resultY_col] = np.where(df[decisionT_col] == 0, predictions,
+                                 df[resultY_col])
+
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+    to_release = int(round(df.shape[0] * r))
+    
+    failed = df[resultY_col][0:to_release] == 0
+
+    return np.sum(failed) / df.shape[0], df.shape[0]
+
+
+
+def perfComp(dgModule, deciderModule, title, save_name):
+    failure_rates = np.zeros((8, 7))
+    error_Ns = np.zeros((8, 7))
+
+    # Create data
+    df = dgModule()
+
+    # Decisions
+    df_labeled, df_unlabeled = deciderModule(df)
+
+    # Split data
+    train, test_labeled = train_test_split(df_labeled, test_size=0.5)
+    
+    # Assign the same observations to the unlabeled data
+    test_unlabeled = df_unlabeled.iloc[test_labeled.index.values]
+
+    # Train model
+    B_model, predictions = fitPredictiveModel(
+        train.loc[train['decision_T'] == 1, 'X'],
+        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], 0, model_type=model_type)
+
+    # Attach predictions to data
+    test_labeled = test_labeled.assign(B_prob_0_model=predictions)
+    test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
+
+    test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+
+    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
+
+    # Find observed values
+    observed = test_labeled['decision_T'] == 1
+
+    # Assign data to the model
+    dat = dict(D=1,
+               N_obs=np.sum(observed),
+               N_cens=np.sum(~observed),
+               K=group_amount,
+               sigma_tau=sigma_tau,
+               M=len(set(df_unlabeled['judgeID_J'])),
+               jj_obs=test_labeled.loc[observed, 'judgeID_J']+1,
+               jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1,
+               kk_obs=kk_array[observed]+1,
+               kk_cens=kk_array[~observed]+1,
+               dec_obs=test_labeled.loc[observed, 'decision_T'],
+               dec_cens=test_labeled.loc[~observed, 'decision_T'],
+               X_obs=test_labeled.loc[observed, 'B_prob_0_model'].values.reshape(-1,1),
+               X_cens=test_labeled.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1),
+               y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
+
+    fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9))
+
+    pars = fit.extract()
+
+    plt.figure(figsize=(15,30))
+
+    fit.plot();
+
+    plt.savefig(save_name + '_stan_diagnostic_plot')
+    
+    plt.show()
+    plt.close('all')
+
+    print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
+
+    # Bayes
+    
+    # Initialize a matrix with one posterior sample per row and one
+    # observation (subject) per column.
+    y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
+    
+    # Fill in the unobserved outcomes with the estimated values.
+    y_imp[:, ~observed] = 1 - pars['y_est']
+    
+    # Fill in the observed outcomes with the observed values.
+    y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
+
+    Rs = np.arange(.1, .9, .1)
+    
+    to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
+        
+    for i in range(len(to_release_list)):
+        
+        failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
+        
+        est_failure_rates =  failed / test_labeled.shape[0]
+                
+        failure_rates[i, 6] = np.mean(est_failure_rates)
+        
+        error_Ns[i, 6] = test_labeled.shape[0]
+                    
+    for r in range(1, 9):
+
+        print(".", end="")
+
+        # True evaluation
+
+        FR, N = trueEvaluationEvaluator(test_unlabeled, 'X', 'decision_T', 
+                                        'result_Y', r / 10)
+        
+        failure_rates[r - 1, 0] = FR
+        error_Ns[r - 1, 0] = N
+
+        # Labeled outcomes only
+
+        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T', 
+                                         'result_Y', r / 10)
+
+        failure_rates[r - 1, 1] = FR
+        error_Ns[r - 1, 1] = N
+
+        # Adjusted labeled outcomes
+
+        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T',
+                                         'result_Y', r / 10, adjusted=True)
+
+        failure_rates[r - 1, 2] = FR
+        error_Ns[r - 1, 2] = N
+
+        # Human evaluation
+
+        FR, N = humanEvaluationEvaluator(test_labeled, 'judgeID_J', 
+                                         'decision_T', 'result_Y', 
+                                         'acceptanceRate_R', r / 10)
+
+        failure_rates[r - 1, 3] = FR
+        error_Ns[r - 1, 3] = N
+
+        # Contraction
+
+        FR, N = contraction(test_labeled, 'judgeID_J', 'decision_T',  
+                            'result_Y', 'B_prob_0_model', 'acceptanceRate_R', r / 10)
+
+        failure_rates[r - 1, 4] = FR
+        error_Ns[r - 1, 4] = N
+
+        # Causal model - analytic solution
+
+        FR, N = monteCarloEvaluator(test_labeled, 'X', 'decision_T', 
+                                    'result_Y', 'acceptanceRate_R', r / 10)
+
+        failure_rates[r - 1, 5] = FR
+        error_Ns[r - 1, 5] = N
+        
+    failure_SEs = standardError(failure_rates, error_Ns)
+
+    x_ax = np.arange(0.1, 0.9, 0.1)
+
+    labels = [
+        'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
+        'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes'
+    ]
+    colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c']
+
+    for i in range(failure_rates.shape[1]):
+        plt.errorbar(x_ax,
+                     failure_rates[:, i],
+                     label=labels[i],
+                     c=colours[i],
+                     yerr=failure_SEs[:, i])
+
+    plt.title('Failure rate vs. Acceptance rate')
+    plt.xlabel('Acceptance rate')
+    plt.ylabel('Failure rate')
+    plt.legend()
+    plt.grid()
+    
+    plt.savefig(save_name + '_all')
+    
+    plt.show()
+
+    print("\nFailure rates:")
+    print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
+    
+    print("\nMean absolute errors:")
+    for i in range(1, failure_rates.shape[1]):
+        print(
+            labels[i].ljust(len(max(labels, key=len))),
+            np.round(
+                np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
+
+
+sm = pystan.StanModel(file=stan_code_file_name)
+
+if which == 1:
+    print("\nWithout unobservables (Bernoulli + independent decisions)")
+
+    dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim)
+
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z"
+    )
+
+gc.collect()
+plt.close('all')
+
+print("\nWith unobservables in the data")
+
+if which == 2:
+    print("\nBernoulli + independent decisions")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations \n" +
+        "Bernoulli + independent decisions, with unobservables",
+        figure_path + "sl_bernoulli_independent_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 3:
+    print("\nThreshold rule + independent decisions")
+
+    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations \n" +
+        "Threshold rule + independent decisions, with unobservables",
+        figure_path + "sl_threshold_independent_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 4:
+    print("\nBernoulli + non-independent (batch) decisions")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: humanDeciderLakkaraju(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations \n" +
+        "Bernoulli + non-independent decisions, with unobservables",
+        figure_path + "sl_bernoulli_batch_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 5:
+    print("\nThreshold rule + non-independent (batch) decisions")
+
+    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: humanDeciderLakkaraju(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations \n" +
+        "Threshold rule + non-independent decisions, with unobservables",
+        figure_path + "sl_threshold_batch_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 6:
+    print("\nRandom decider")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: randomDecider(
+        x, nJudges_M=M_sim, use_acceptance_rates=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Bernoulli + random decider with leniency and unobservables",
+        figure_path + "sl_random_decider_with_Z",
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 7:
+    print("\nBiased decider")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Bernoulli + biased decider with leniency and unobservables",
+        figure_path + "sl_biased_decider_with_Z",
+    )
+
+
+if which == 8:
+    print("\nBad judge")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Bernoulli + 'bad' decider with leniency and unobservables",
+        figure_path + "sl_bad_decider_with_Z"
+    )
+
+gc.collect()
+plt.close('all')
+
+if which == 9:
+    print("\nBernoulli + Bernoulli")
+
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Bernoulli + Bernoulli",
+        figure_path + "sl_bernoulli_bernoulli_with_Z",
+    )
+    
+if which == 10:
+    print("\nBeta_Z = 3, Threshold + batch")
+
+    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0)
+
+    decider = lambda x: humanDeciderLakkaraju(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Beta_Z = 3, threshold + batch",
+        figure_path + "sl_threshold_batch_beta_Z_3_with_Z",
+    )
+    
+if which == 11:
+    print("\nBeta_Z = 5, Threshold + batch")
+
+    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0)
+
+    decider = lambda x: humanDeciderLakkaraju(
+        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Beta_Z = 5, threshold + batch",
+        figure_path + "sl_threshold_batch_beta_Z_5_with_Z",
+    )
\ No newline at end of file
diff --git a/paper/sl.tex b/paper/sl.tex
index b365c1b..7f24905 100755
--- a/paper/sl.tex
+++ b/paper/sl.tex
@@ -35,7 +35,7 @@
 
 \newcommand{\antti}[1]{{{\color{orange} [AH: #1]}}}
 
-\newcommand{\ourtitle}{A Causal Approach for Selective Labels}
+\newcommand{\ourtitle}{Working title: From would-have-beens to should-have-beens: Counterfactuals in model evaluation}
 
 \input{macros}
 \usepackage{chato-notes}
@@ -76,39 +76,40 @@
 	\end{itemize}
 \item Motivation for the study
 	\begin{itemize}
-		\item Lot of decisions are being made which affects the course of human lives
-		\item Using computational models could enhance the decision-making process in terms of accuracy and fairness.
-		\item The advantage of using models does not necessarily lie in pure performance, that a machine can make multiple decisions, but rather in that a machine can learn from a vast set of information and that with care, a machine can be made as unbiased as possible.
-		\item The explainability of black-box models has been discussed in X.
-		\item We don't discuss fairness.
-		\item Selective labeling is an issue in multiple fields where machine learning algorithms could be deployed. (Loans, medicine, justice, insurance, ...)
-		\item Before deploying any algorithms, they should be audited to show that they actually improve on human decision-making.
-		\item Auditing algorithms in conventional settings is trivial, (almost) all of the labels available, numerous metrics have been proposed and are in use in multiple fields.
+		\item A lot of decisions are being made that affect the course of human lives.
+		\item Computational models could enhance the decision-making process in accuracy and fairness.
+		\item The advantage of using models does not necessarily lie in raw throughput (a machine can make more decisions), but rather in that a machine can give bounds for its uncertainty, can learn from a vast set of information and, with care, can be made as unbiased as possible.
+		\item Fairness has been discussed in the existing literature and numerous publications are available for interested readers. Our emphasis in this paper is on pure predictive performance, i.e. getting the predictions accurate.
+		\item Before deploying any algorithms, they should be evaluated to show that they actually improve on human decision-making.
+		\item Evaluating algorithms in conventional settings, where (almost) all of the labels are available, is straightforward: numerous metrics have been proposed and are in use in multiple fields.
 	\end{itemize}
 \item Present the setting and challenge:
 	\begin{itemize}
-		\item `Selective labels' settings arise in situations where data are the product of a decision mechanism that prevents us from observing outcomes for part of the data.
-		\item A typical example is that of bail-or-jail decisions in judicial settings: a judge decides whether to grant bail to a defendant based on whether the defendant is considered likely to violate bail conditions while awaiting trial -- and therefore a violation might occur only in case bail is granted.
+		\item Specifically, `selective labels' settings arise in situations where data are the product of a decision mechanism that prevents us from observing outcomes for part of the data.
+		\item A typical example is that of bail-or-jail decisions in judicial settings: a judge decides whether to grant bail to a defendant based on whether the defendant is considered likely to violate bail conditions while awaiting trial -- and therefore a violation might occur only in case bail is granted. Naturally, similar scenarios are observed in many walks of life, from banking to medicine.
 		\item Such settings give rise to questions about the effect of alternative decision mechanisms  -- e.g., `how many defendants would violate bail conditions if more bail decisions were granted?'.
-		\item In other words, one faces the challenge to estimate the performance of an alternative, potentially automated, decision policy that might make different decisions than the one found in the existing data.
-		\item Missing labels and decisions made by different deciders
+		\item In other words, one faces the challenge to estimate the performance of an alternative, potentially automated, decision policy that might make different decisions than the ones found in the existing data.
+		\item Characteristically, in many of these settings the decisions that hide the outcomes are made by multiple different decision-makers.
 		\item Labels are missing non-randomly, decisions might be made by different deciders who differ in leniency.
-		\item (Note: our approach doesn't require multiple deciders)
-		\item In settings like judicial bail decisions, some outcomes cannot be observed due to the nature of the decisions. This results in a complicated missing data problem where the missingness of an item is connected with its outcome and where the available labels are a non-random sample of the underlying population. Recently this problem has been named the selective labels problem.
+		\item This might lead to a situation where subjects with the same characteristics are given different decisions due to the differing leniency.
+		\item Of course, the differing decisions might also be attributable to some unobserved information that the decision-maker had available, e.g. due to meeting with the subject.
+		\item The explainability of black-box models has been discussed in X. We don't discuss fairness.
+		\item In settings like judicial bail decisions, some outcomes cannot be observed due to the nature of the decisions. This results in a complicated missing data problem where the missingness of an item is connected with its outcome and where the available labels are not a random sample of the true population. Recently this problem has been named the selective labels problem.
 	\end{itemize}
 \item Related work
 	\begin{itemize}
-		\item Lakkaraju presented contraction which performed well compared to other methods previously presented in the literature. We wanted to benchmark our approach to that and show that we can improve on their algorithm in terms of restrictions and accuracy. 
+		\item In the original paper, Lakkaraju et al. presented contraction which performed well compared to other methods previously presented in the literature. 
+		\item We wanted to benchmark our approach against it and show that we can improve on their algorithm in terms of restrictions and accuracy.
 		\item Restrictions = our method doesn't have so many assumptions (random assignments, agreement rate, etc.) and can estimate the performance on all levels of leniency despite the judge with the highest leniency. See fig 5 from Lakkaraju
-		\item Jung et al presented their method for constructing optimal policy, we show that that approach can be applied to the selective labels setting.
+		\item Jung et al. presented a method for constructing optimal policies; we show that their approach can also be applied to the selective labels setting.
 		\item They didn't have selective labeling nor did they consider that the judges would differ in leniency.
-		\item Selection bias has been extensively discussed in the causal inference literature (Pearl, Bareinboim etc.)
+		\item The selective labels issue has been addressed in the causal inference literature under the heading of selection bias. That discussion has mainly concentrated on recovering causal effects, and the assumed model structure has usually been different (Pearl, Bareinboim etc.)
+		\item Latent confounding has been discussed by X when discussing the effect of latent confounders on ORs, etc.
 	\end{itemize}
 \item Our contribution
 	\begin{itemize}
-	\item In this paper we propose a (novel modular) framework for presenting these missing data problems by breaking it into different modules and explicating their function.
-	\item In addition, we present an approach for inferring the missing labels to evaluate the performance of predictive models in settings where selective labeling and latent confounding is present. We use a flexible Bayesian approach to estimate the failure rate of a given model.
-	\item We show that our method is robust against violations and modifications in the data generating mechanisms.
+	\item In this paper we propose a (novel modular) framework to provide a systematic way of presenting these missing data problems by breaking them into different modules and explicating their function.
+	\item In addition, we present an approach for inferring / imputing the missing labels to evaluate the performance of predictive models in settings where selective labeling and latent confounding are present. We use the theory of counterfactuals and causal inference to formally define the problem, and a flexible Bayesian approach for the computation.
 	\end{itemize}
 \end{itemize}
 
@@ -119,49 +120,59 @@ In this section, we define the key terms used in this paper, present the modular
 
 \begin{itemize}
 \item Definitions \\
-	In this paper we apply our approach on binary outcomes, but our approach is readily modifiable to accompany continuous or categorical responses. Then we could use e.g. sum of squared errors or other appropriate metric as the measure for performance.
+	In this paper we apply our approach to binary (positive / negative) outcomes, but our approach is readily extendable to accommodate continuous or categorical responses. Then we could use e.g. the sum of squared errors or another appropriate metric as the measure of performance.
+	With positive or negative outcomes we refer to...
 	\begin{itemize}
 	\item Failure rate
 		\begin{itemize}
 		\item Failure rate (FR) is defined as the ratio of undesired outcomes to given decisions. One special characteristic of FR in this setting is that a failure can only occur with a positive decision / we can only observe the outcome when the corresponding decision is positive.
-		\item That means that a failure rate of zero can be achieved just by not giving any positive decisions but that is not the ultimate goal.
+		\item That means that a failure rate of zero could be achieved simply by not giving any positive decisions, but that is not the ultimate goal; rather, the goal is to find a good balance (e.g. resource issues in prisons). (A sketch of these quantities in symbols follows this list.)
 		\end{itemize}
 	\item Acceptance rate
 		\begin{itemize}
-		\item Acceptance rate (AR) is defined as the ratio of positive decisions to all decisions that a decision-maker will give.
+		\item Acceptance rate (AR) or leniency is defined as the ratio of positive decisions to all decisions that a decision-maker will give. (Semantically, what is the difference between AR and leniency? AR is always computable from the data, whereas leniency is a latent propensity that does not directly manifest.)
 		\item In some settings, (justice, medicine) people might want to find out if X\% are accepted what is the resulting failure rate, and what would be the highest acceptance rate to have to have the failure rate at an acceptable level. 
 		\item We want to know the trade-off between acceptances and failure rate.
 		\item Lakkaraju mentioned the problem in the data that judges which have a higher leniency have labeled a larger portion of the data (which might results in bias).
+		\item As mentioned earlier, these differences in AR might lead to subjects getting different decisions while having the same observable and unobservable characteristics.
 		\end{itemize}
-	\item With decider or decision-maker we might refer to a judge, a doctor, ... who makes the decisions on which labels are available. Some deciders might have an incentive for positive decisions if it can mean e.g. savings. Judge makes saving by not jailing a defendant. Doctor makes savings by not assigning patient for a higher intensity care. (move to motivation?)
-	\item With unobservables we refer to some latent, non-written information regarding a certain outcome that is only available to the decision-maker.
+	\item With decider or decision-maker we might refer to a judge, a doctor, ... who makes the decisions that determine which labels are available. % Some deciders might have an incentive for positive decisions if it can mean e.g. savings. Judge makes saving by not jailing a defendant. Doctor makes savings by not assigning patient for a higher intensity care. (move to motivation?)
+	\item With unobservables we refer to some latent, usually non-written information regarding a certain outcome that is only available to the decision-maker. For example, a judge in court can observe the defendant's behaviour and level of remorse, which might be indicative of a bail violation. We denote the latent information regarding a person's guilt with the variable \unobservable.
 \end{itemize}
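+% A sketch of the two quantities in symbols, assuming decisions $\decision_i$ and outcomes $\outcome_i$ for subjects $i = 1, \dots, N$, with failures observable only when $\decision_i = 1$:
+% \begin{align*}
+%	\text{AR} & = \frac{1}{N}\sum_{i=1}^{N} \mathbf{1}\{\decision_i = 1\}, &
+%	\text{FR} & = \frac{1}{N}\sum_{i=1}^{N} \mathbf{1}\{\decision_i = 1,\ \outcome_i = 0\}.
+% \end{align*}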
 \item Modules \\
-	We separated steps that modify the data into separate modules in order to formally define their inner workings. Modules have different functions, inputs and outputs. Modules are interchangeable with a similar type of module (You can change decider module of type A with decider module of type B).
+	We separated the steps that modify the data into distinct modules to formally define how they work. With observational data sets, the data goes through only a modelling step and an evaluation step. With synthetic data, we also need to define a data generating step. We call the blocks performing these steps {\it modules}. To fully define a module, one must define its input and output; modules are interchangeable with a module of the same type if they share the same input and output (you can replace a decider module of type A with a decider module of type B). With this modular framework we achieve a unified way of presenting the key differences between settings. (A minimal interface sketch follows the list below.)
 	\begin{itemize}
 	\item Decider modules
 		\begin{itemize}
 		\item In general, the decider module assigns predictions to the observations based on some information.
-		\item The information available to a decider in the decider module includes observable and -- possibly -- unobservable features, denoted with X and Z respectively.
-		\item The predictions given by a decider module can be relative or absolute. With relative predictions we refer to that a decider module can give out a ranking or an ordering of the subjects based on their predicted tendency towards an outcome. Absolute predictions can be either binary or continuous in nature. They can correspond to yes or no decisions or to a probability value.
-		\item Inner workings of the module may or may not be known. In observational data sets, the mechanism or the decider which has labeled the data is usually unknown.
+		\item The information available to a decision-maker in the decider module includes observable and -- possibly -- unobservable features, denoted with X and Z respectively.
+		\item The predictions given by a decider module can be relative or absolute. By relative predictions we mean that a decider module can give out a ranking of the subjects based on their predicted tendency towards an outcome. Absolute predictions can be either binary or continuous in nature. For example, they can correspond to yes or no decisions or to a probability value.
+		\item The inner workings (procedure/algorithm) of the module may or may not be known. In observational data sets, the mechanism or the decider which has labeled the data is usually unknown. E.g. we do not -- exactly -- know how judges arrive at a decision. Conversely, in synthetic data sets the procedure creating the decisions is fully known because we define the process.
 		\item The decider (module) in the data step has unobservable information available for making the decisions. 
-		\item The behaviour of the decider module in the data step can be defined in many ways. We have used both the method presented by Lakkaraju et al. and two methods of our own. We created these two deciders to remove the interdependencies of the decisions made by the decider Lakkaraju et al. presented.
+		\item The behaviour of the decider module in the data generating step can be defined in many ways. We have used both the method presented by Lakkaraju et al. and two methods of our own. We created these two deciders to remove the interdependencies of the decisions made by the decider Lakkaraju et al. presented.
 		\item The difference between the deciders in the data and modelling steps is that usually we cannot observe all the information that has been available to the decider in the data step as opposed to the decider in the modelling step. In addition, we usually cannot observe the full decision-making process of the decider in the data step contrary to the decider in the modelling step.
 		\end{itemize}
 	\item Evaluator modules
 		\begin{itemize}
-		\item Evaluator module gets the decisions, observable features of the subject and predictions made by the deciders in the data and modelling
-		\item The evaluator module outputs a reliable estimate of a decider module's performance. The estimate is created by the evaluator module and it should be precise and unbiased and it should have a low variance. The output of the evaluator module should also be as robust as possible to slight changes in the data generation. The estimate of the evaluator should also be accurate for all levels of leniency of the deciders.
+		\item The evaluator module gets the decisions, the observable features of the subjects and the predictions made by the deciders, and outputs an estimate of...
+		\item The evaluator module outputs a reliable estimate of a decider module's performance. The estimate should
+			\begin{itemize}
+			\item be precise and unbiased
+			\item have a low variance
+			\item be as robust as possible to slight changes in the data generation. 
+			\end{itemize}
+		\item The estimate of the evaluator should also be accurate for all levels of leniency.
 		\end{itemize}
 	\end{itemize}
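+% A minimal interface sketch in Python (illustrative; the deciders and evaluators in the accompanying scripts are more involved, but share this shape: a data frame goes in, a data frame with decisions or a performance estimate comes out):
+% import numpy as np
+%
+% def random_decider(df, acceptance_rate=0.5, seed=0):
+%     """Decider module: return a copy of the pandas frame with a decision_T column."""
+%     rng = np.random.default_rng(seed)
+%     out = df.copy()
+%     out['decision_T'] = (rng.random(len(out)) < acceptance_rate).astype(int)
+%     return out
+%
+% def observed_failure_rate(df):
+%     """Evaluator module: failures (result_Y = 0) are observable only for
+%     positive decisions; FR is taken over all decisions."""
+%     return np.mean((df['decision_T'] == 1) & (df['result_Y'] == 0))
+%
+% # Any decider with the same input/output can be swapped in without changing
+% # the evaluator, and vice versa.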
 %\item Example: in observational data sets, the deciders have already made decision concerning the subjects and we have a selectively labeled data set available. In the modular framework we refer to the actions of the human labelers as a decider module which has access to latent information. 
 \item Problem formulation \\
-Given the selective labeling of data and the latent confounders present, our goal is to create an evaluator module that can output a reliable estimate of a given decider module's performance. We use acceptance rate and failure rate as measures against which we compare our evaluators because they have direct and easily understandable counterparts in the real world / applicable domains. The evaluator module should be able to accurately estimate the failure rate for all levels of leniency and all data sets.
+Given the selective labeling of data, multiple decision-makers and the latent confounders present, our goal is to create an evaluator module that can output a reliable estimate of a given decider module's performance. We use acceptance rate and failure rate as measures against which we compare our evaluators because they have direct and easily understandable counterparts in the applicable domains.
 
-The "eventual goal" is to create such an evaluator module that it can outperform (have a lower failure on all levels of acceptance rate) the deciders in the data generating process. The problem is of course comparing the performance of the deciders. We try to address that.
+%The "eventual goal" is to create such a decider module that it can outperform (have a lower failure on all levels of acceptance rate) the deciders in the data generating process. The problem is of course comparing the performance of the deciders. We try to address that.
+
+%(It's important to somehow keep these two different goals separate.)
+We show that our method is robust against violations and modifications in the data generating mechanisms.
 
-(It's important to somehow keep these two different goals separate.)
 \end{itemize}
 
 \section{Counterfactual-Based Imputation For Selective Labels}
@@ -173,13 +184,13 @@ The "eventual goal" is to create such an evaluator module that it can outperform
 		\begin{itemize}
 		\item hypothesized quantities that encode the would-have-been relation of the outcome and the treatment assignment.
 		\item Using counterfactuals, we can discuss hypothetical events that didn't happen. 
-		\item Using counterfactuals requires defining a structural model
-		\item Pearl's Book of Why "The fundamental problem"
+		\item Using counterfactuals requires defining a structural causal model.
+		\item Pearl's Book of Why: "The fundamental problem"
 		\end{itemize}
 	\item By defining structural equations / a graph
 		\begin{itemize}
 		\item we can begin formulating causal questions to get answers to our questions.
-		\item Once we have defined the equations, counterfactuals obtained by...
+		\item Once we have defined the equations, counterfactuals are obtained by... (abduction, action, prediction, don't we apply the do operator on the \decision, so that we obtain $\outcome_{\decision=1}(x)$?)
 		\item We denote the counterfactual "Y had been y had T been t" with...
 		\item By first estimating the distribution of the latent variable Z we can impose 
 		\item Now counterfactuals can be defined as
@@ -193,30 +204,31 @@ The "eventual goal" is to create such an evaluator module that it can outperform
 	\item In a high level
 		\begin{itemize}
 		\item there is usually some data recoverable from the unobservables. For example, if the observable attributes are contrary to the outcome/decision we can claim that the latent variable included some significant information.
-		\item We retrieve this information using the prespecified structural equations. After estimating the desired parameters, we can estimate the value of the counterfactual (not observed) outcome by switching the value of X and doing the computations through the rest of the graph...
+		\item We retrieve this information using the prespecified structural equations. After estimating the desired parameters, we can estimate the value of the counterfactual (not observed) outcome by switching the value of \decision and doing the computations through the rest of the graph...
 		\end{itemize}
+	\item Because the causal effect of \decision on \outcome is not identifiable, we used a Bayesian approach.
 	\item Recent advances in the computational methods provide us with ways of inferring the value of the latent variable by applying Bayesian techniques to... Previously this kind of analysis required us to define X and compute Y...
 \end{itemize}
 \item Model (Structure, equations in a general and more specified level, assumptions, how we construct the counterfactual...) 
 	\begin{itemize}
-	\item Structure is as is in the diagram. Square around Z represents that it's unobservable/latent
+	\item The structure is as shown in the diagram. The square around Z represents that it is unobservable/latent.
 	The features of the subjects include observable and -- possibly -- unobservable features, denoted with X and Z respectively. The only feature of a decider is their leniency R (depicting some baseline probability of a positive decision). The decisions given will be denoted with T and the resulting outcomes with Y, where 0 stands for negative outcome or decision and 1 for positive.
 	\item The causal diagram presents how decision T is affected by the decider's leniency (R), the subject's observable private features (X) and the latent information regarding the subject's tendency for a negative outcome (Z). Correspondingly the outcome (Y) is affected only by the decision T and the above-mentioned features X and Z. 
 	\item The causal directions and implied independencies are readable from the diagram. We assume X and Z to be independent.
 	\item The structural equations connecting the variables can be formalized in a general level as (see Jung)
 		\begin{align} \label{eq:structural_equations}
-		\outcome(0) & = 1 / NA? \\ \nonumber
-		\outcome(1) & \sim f(\featuresValue, \unobservableValue; \beta_{\featuresValue\outcomeValue}, \beta_{\unobservableValue\outcomeValue}) \\ \nonumber
-		\decision      & \sim g(\featuresValue, \unobservableValue; \beta_{\featuresValue\decisionValue}, \beta_{\unobservableValue\decisionValue}, \alpha_j), \\ \nonumber
-		\outcome & =\outcome(\decisionValue)\\ \nonumber
+		\outcome_0 & = \text{NA} \\ \nonumber
+		\outcome_1 & \sim f(\featuresValue, \unobservableValue; \beta_{\featuresValue\outcomeValue}, \beta_{\unobservableValue\outcomeValue}) \\ \nonumber
+		\decision      & \sim g(\featuresValue, \unobservableValue; \beta_{\featuresValue\decisionValue}, \beta_{\unobservableValue\decisionValue}, \alpha_j) \\ \nonumber
+		\outcome & =\outcome_\decisionValue \\ \nonumber
 		\end{align}
 	where the beta and alpha coefficients are the path coefficients specified in the causal diagram
 	\item This general formulation of the selective labels problem enables the use of this approach even when the outcome is not binary. Notably this approach -- compared to that of Jung et al. -- explicates the selective labels issue to the structural equations when we deterministically set the value of outcome y to be one in the event of a negative decision. In addition, we allow the judges to differ in the baseline probabilities for positive decisions, which is by definition leniency.
 	\item Now by imposing a value for the decision \decision we can obtain the counterfactual by simply assigning the desired value to the equations in \ref{eq:structural_equations}. This assumes that... (Consistency constraint) Now we want to know {\it what would have been the outcome \outcome for this individual \featuresValue had the decision been $\decision = 1$, or more specifically $\outcome_{\decision = 1}(\featuresValue)$}.
 	\item To compute the value for the counterfactuals, we need to obtain estimates for the coefficients and latent variables. We specified a Bayesian (/structural) model, which requires establishing a set of probabilistic expressions connecting the observed quantities to the parameters of interest. The relationships of the variables and coefficients are presented in equation \ref{eq:structural_equations} and figure X in a general level. We modelled the observed data as  
 		\begin{align} \label{eq:data_model}
-		 y(1) & \sim \text{Bin}(1, \invlogit(\beta_{xy,k[i]} x + \beta_{zy,k[i]} z_i)) \\ \nonumber
-		 t & \sim \text{Bin}(1, \invlogit(\alpha_{j[i]} + \beta_{xt,k[i]} x + \beta_{zt,k[i]}z)). \\ \nonumber
+		 y(1) & \sim \text{Bernoulli}(\invlogit(\beta_{xy} x + \beta_{zy} z)) \\ \nonumber
+		 t & \sim \text{Bernoulli}(\invlogit(\alpha_{j} + \beta_{xt} x + \beta_{zt}z)). \\ \nonumber
 		\end{align}
 	\item Bayesian models also require the specification of prior distributions for the variables of interest to obtain an estimate of their distribution after observations, the posterior distribution.
 	\item Identifiability of models with unobserved confounding has been discussed by eg McCandless et al and Gelman. As by Gelman we note that scale-invariance has been tackled with specifying the priors.  (?)
@@ -224,13 +236,15 @@ The "eventual goal" is to create such an evaluator module that it can outperform
 	\end{itemize}
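+% One way to write the resulting imputation step (a sketch; notation as in equation \ref{eq:structural_equations}, integrating over the posterior of the coefficients and the latent \unobservable):
+% \begin{align*}
+%	\mathbb{E}[\outcome_{\decision = 1}(\featuresValue) \mid \text{data}]
+%		= \int \invlogit(\beta_{xy} \featuresValue + \beta_{zy} z)\,
+%		  p(z, \beta_{xy}, \beta_{zy} \mid \text{data})\, \mathrm{d}z\, \mathrm{d}\beta.
+% \end{align*}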
 \item Computation (Stan in general, ...)
 	\begin{itemize}
-	\item Using the model specified in equation X, we used Stan to estimate the intercepts, path coefficients and latent variables. Stan provides tools for efficient computational estimates of posterior distributions.  Stan uses No-U-Turn Sampling (NUTS), an extension of Hamiltonian Monte Carlo (HMC) algorithm, to computationally estimate the posterior distribution for inferences. (In a high level, the sampler utilizes the gradient of the posterior to compute potential and kinetic energy of an object in the multi-dimensional surface of the posterior to draw samples from it.) Stan also has implementations of black-box variational inference algorithms and direct optimization algorithms for the posterior distribution but they were deemed to be insufficient for estimating the posterior in this setting
+	\item Using the model specified in equation \ref{eq:data_model}, we used Stan to estimate the intercepts, path coefficients and latent variables. Stan provides tools for efficient computational estimation of posterior distributions. Stan uses the No-U-Turn Sampler (NUTS), an extension of the Hamiltonian Monte Carlo (HMC) algorithm, to computationally estimate the posterior distribution for inferences. (At a high level, the sampler utilizes the gradient of the posterior to compute the potential and kinetic energy of an object on the multi-dimensional surface of the posterior in order to draw samples from it.) Stan also has implementations of black-box variational inference algorithms and direct optimization algorithms for the posterior distribution, but they were deemed insufficient for estimating the posterior in this setting. (A computational sketch follows this list.)
 	\item Chain lengths were set to X and number of chains deployed was Y. (Explain algorithm fully later)
 	\end{itemize}
 \end{itemize}
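+% A minimal sketch of the computation step with PyStan (the file name, data dictionary and sampler settings below are placeholders, not final values):
+% import pystan
+%
+% # stan_data: dict mapping the names in the Stan program's data block to the
+% # observed decisions, outcomes, features and judge IDs (assembled elsewhere).
+% sm = pystan.StanModel(file='code_full.stan')
+% fit = sm.sampling(data=stan_data, chains=4, iter=4000,
+%                   control={'adapt_delta': 0.9})
+% posterior = fit.extract()   # dict of posterior draws for all parameters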
 
 \section{Extension To Non-Linearity (2nd priority)}
 
+% If X has multiple dimensions or the relationships between the features and the outcomes are clearly non-linear, the presented approach can be extended to accommodate non-linearity. Jung proposed that... Groups... etc etc.
+
 \section{Related work}
 
 \begin{itemize}
@@ -252,12 +266,12 @@ In this section we present our results from experiments with synthetic and reali
 \begin{itemize}
 \item Data generation
 	\begin{itemize}
-	\item We experimented with synthetic data sets to show that our method is accurate, unbiased and low variance.
-	\item We imitated the data generation process presented by Lakkaraju et al. for the benchmarking to make sense.
+	\item We experimented with synthetic data sets to show that our method is accurate, unbiased and robust to violations of the assumptions.
+	\item We imitated the data generation process presented by Lakkaraju et al. for benchmarking purposes.
 	\item We created data by sampling N=50k observations from three independent standard gaussians.
 	\item The observations were assigned to variables X, Z, and W.
-	\item We then drew the outcome Y from a Bernoulli distribution with parameter p=1 -- logit-1(...) 
-	\item This is one data generation module. It can be / was modified by changing the outcome producing mechanism.
+	\item We then drew the outcome Y from a Bernoulli distribution with parameter $p = 1 - \invlogit(\beta_x x + \beta_z z + \beta_w w)$ so that $\prob{Y=0|X, Z, W} = \invlogit(\beta_x x + \beta_z z + \beta_w w)$, where the coefficients for X, Z and W were set to 1, 1 and 0.2 respectively.
+	\item This is one data generation module (a Python sketch is given below). It can be / was modified by changing the outcome producing mechanism. For other experiments we changed the outcome generating mechanism so that the outcome was assigned value 1 if 
 	\item Next, the decisions were assigned by computing the quantile the subject belongs to. The quantile was obtained as the inverse cdf of ... . 
 	\item This way the observations were independent and the still the leniency would be a good estimate of the acceptance rate. (The acceptance rate would stochastically converge to the leniency.)
 	\item This is a decider module. We experimented with different combinations of decider and data generating modules to show X / see Y. (to see that our method is robust against non-informative, biased and bad decisions . Due to space constraints we defer these results...)
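+% A minimal sketch of this data generating module (illustrative seed, frame layout and column names; the generator in the accompanying scripts may differ in details):
+% import numpy as np
+% import pandas as pd
+% from scipy.special import expit   # inverse logit
+%
+% N = 50_000
+% rng = np.random.default_rng(0)
+% X, Z, W = rng.normal(size=(3, N))            # three independent standard Gaussians
+% p_fail = expit(1.0 * X + 1.0 * Z + 0.2 * W)  # P(Y = 0 | X, Z, W)
+% Y = rng.binomial(1, 1.0 - p_fail)            # undesired outcome coded as Y = 0
+% data = pd.DataFrame({'X': X, 'Z': Z, 'W': W, 'result_Y': Y})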
@@ -267,7 +281,9 @@ In this section we present our results from experiments with synthetic and reali
 	\begin{itemize}
 	\item True evaluation
 		\begin{itemize}
-		\item Depicts the true performance of the model. "How well would this model perform had it been deployed?" Not available in real data. Calculated by ordering the observations based on the predictions from the black-box model B and counting the failure rate from the ground truth labels.
+		\item Depicts the true performance of the model. "How well would this model perform had it been deployed?" 
+		\item Not available when using observational data. 
+		\item Calculated by ordering the observations based on the predictions from the black-box model B and counting the failure rate from the ground truth labels. (A sketch follows this list.)
 		\end{itemize}
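+% A minimal sketch of this evaluator (one plausible reading of the step above; the implementation in the scripts may differ, e.g. in the choice of denominator):
+% import numpy as np
+%
+% def true_evaluation(df, r, prob_col='B_prob_0_model', outcome_col='result_Y'):
+%     """Release the r * N subjects that model B scores as least risky and
+%     count ground-truth failures among them over all N decisions."""
+%     n = len(df)
+%     released = df.sort_values(prob_col, ascending=True).head(int(r * n))
+%     return np.sum(released[outcome_col] == 0) / n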
 	\item Human evaluation
 		\begin{itemize}
@@ -282,18 +298,43 @@ In this section we present our results from experiments with synthetic and reali
 		\end{itemize}
 	\item Contraction
 		\begin{itemize}
-		\item Algorithm by Lakkaraju et al. Depends on the random assigning of subjects to judges and requires that the judges differ in leniency. 
-		\item Can estimate the true failure only to q.
-		\item Performance is affected by the number of people judged by the most lenient decision-maker, the agreement rate and the leniency of the most lenient decision-maker.
+		\item Algorithm by Lakkaraju et al. Assumes that the subjects are assigned to the judges at random and requires that the judges differ in leniency. 
+		\item Can estimate the true failure rate only up to the leniency of the most lenient decision-maker.
+		\item Performance is affected by the number of people judged by the most lenient decision-maker, the agreement rate and the leniency of the most lenient decision-maker. (Performance is guaranteed / better when ...)
 		\item Works only on binary outcomes
 		\item (We show that our method isn't constrained by any of these)
 		\item The algorithm goes as follows...
+%\begin{algorithm}[] 			% enter the algorithm environment
+%\caption{Contraction algorithm \cite{lakkaraju17}} 		% give the algorithm a caption
+%\label{alg:contraction} 			% and a label for \ref{} commands later in the document
+%\begin{algorithmic}[1] 		% enter the algorithmic environment
+%\REQUIRE Labeled test data $\D$ with probabilities $\s$ and \emph{missing outcome labels} for observations with $T=0$, acceptance rate r
+%\ENSURE
+%\STATE Let $q$ be the decision-maker with highest acceptance rate in $\D$.
+%\STATE $\D_q = \{(x, j, t, y) \in \D|j=q\}$
+%\STATE \hskip3.0em $\rhd$ $\D_q$ is the set of all observations judged by $q$
+%\STATE
+%\STATE $\RR_q = \{(x, j, t, y) \in \D_q|t=1\}$
+%\STATE \hskip3.0em $\rhd$ $\RR_q$ is the set of observations in $\D_q$ with observed outcome labels
+%\STATE
+%\STATE Sort observations in $\RR_q$ in descending order of confidence scores $\s$ and assign to $\RR_q^{sort}$.
+%\STATE \hskip3.0em $\rhd$ Observations deemed as high risk by the black-box model $\mathcal{B}$ are at the top of this list
+%\STATE
+%\STATE Remove the top $[(1.0-r)|\D_q |]-[|\D_q |-|\RR_q |]$ observations of $\RR_q^{sort}$ and call this list $\mathcal{R_B}$
+%\STATE \hskip3.0em $\rhd$ $\mathcal{R_B}$ is the list of observations assigned to $t = 1$ by $\mathcal{B}$
+%\STATE
+%\STATE Compute $\mathbf{u}=\sum_{i=1}^{|\mathcal{R_B}|} \dfrac{\delta\{y_i=0\}}{| \D_q |}$.
+%\RETURN $\mathbf{u}$
+%\end{algorithmic}
+%\end{algorithm}
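+% A Python sketch of the same steps (column names are illustrative; the implementation in the accompanying scripts may differ):
+% import numpy as np
+%
+% def contraction(df, judge_col, decision_col, outcome_col, score_col, r):
+%     acceptance = df.groupby(judge_col)[decision_col].mean()
+%     q = acceptance.idxmax()                      # most lenient decision-maker
+%     D_q = df[df[judge_col] == q]                 # all cases judged by q
+%     R_q = D_q[D_q[decision_col] == 1]            # cases with observed outcomes
+%     R_sort = R_q.sort_values(score_col, ascending=False)   # high risk first
+%     k = int((1.0 - r) * len(D_q)) - (len(D_q) - len(R_q))
+%     R_B = R_sort.iloc[max(k, 0):]                # what the model would release at rate r
+%     return np.sum(R_B[outcome_col] == 0) / len(D_q)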
 		\end{itemize}
 	\item Potential outcomes / CBI
 		\begin{itemize}
 		\item Take test set
-		\item Estimate the parameters from equations XX
-		\item Using the posterior predictive distribution, obtain a point estimate for the failure rate
+		\item Compute the posterior for parameters and variables presented in equation \ref{eq:data_model}.
+		\item Using the posterior predictive distribution, draw estimates for the counterfactuals.
+		\item Impute the missing outcomes using the estimates from the previous step.
+		\item Obtain a point estimate for the failure rate by computing the mean. (A sketch of this imputation step follows the list.)
 		\item Estimates for the counterfactuals Y(1) for the unobserved values of Y were obtained using the posterior expectations from Stan. We used the NUTS sampler to estimate the posterior. When the values for...
 		\end{itemize}
 	\end{itemize}
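+% A minimal sketch of the imputation step, assuming the Stan fit exposes the posterior predictive draws of the censored outcomes as y_est and ignoring the acceptance-rate bookkeeping handled elsewhere:
+% import numpy as np
+%
+% y_est = fit.extract()['y_est']        # draws x censored cases, values in {0, 1}
+% y_filled = np.concatenate([np.asarray(y_obs, dtype=float), y_est.mean(axis=0)])
+% failure_rate = np.mean(1.0 - y_filled)  # outcome 0 is the undesired one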
@@ -315,9 +356,9 @@ In this section we present results from experiments with (realistic) data sets.
 		\begin{itemize}
 		\item COMPAS = Correctional Offender Management Profiling for Alternative Sanctions is Northpointe's (now diff. name) tool for guiding decisions in the criminal justice system.
 		\item COMPAS general recidivism risk score is made to predict recidivism in the following two years,
-		\item Data comprises of 6172 subjects assessed at Broward county, California. 
-		\item Data was made available and was preprocessed by ProPublica.
-		\item Their analysis and results are presented in the original article "Machine Bias".
+		\item The final data set comprises 6172 subjects assessed at Broward County, Florida. The data was preprocessed to include only subjects assessed at the pretrial stage and (something about traffic charges).
+		\item Data was made available by ProPublica.
+		\item Their analysis and results are presented in the original article ``Machine Bias'', in which they argue that the COMPAS metric assigns biased risk evaluations based on race.
 		\item Data includes the subjects' demographic information (incl. gender, age, race) and information on their previous offences. 
 		\end{itemize}
 	\item Subsequent modifications for analysis 
@@ -328,12 +369,12 @@ In this section we present results from experiments with (realistic) data sets.
 		\item As the COMPAS score is derived mainly from "prior criminal history, criminal associates, drug involvement, and early indicators of juvenile delinquency problems" so it can be said to have external information available, not coded into the four above-mentioned variables. (quoted text copy-pasted from here)
 		\item Data was split to test and training sets
 		\item A logistic regression model was built to predict two-year recidivism from categorized age, gender, the number of priors, degree of crime COMPAS screened for (felony/misdemeanor)
+		\item We used these same variables as input to the CBI evaluator.
 		\end{itemize}
 	\item Results
 		\begin{itemize}
-		\item Results from this analysis is presented in figure X. In the figure we see that CBI follows the true evaluation curve very closely when there are more than K=? groups. Our approach also has a lower standard deviation.
+		\item Results from this analysis are presented in figure X. In the figure we see that CBI follows the true evaluation curve very closely.
 		\item We can also deduce from the figure that if this predictive model was to be deployed, it wouldn't necessarily improve on the decisions made by these synthetic judges.
-		\item We experimented also with different values of prior variance, but it didn't affect the results.
 		\end{itemize}
 	\end{itemize}
 \item Catalonian data (this could just be for our method? Hide ~25\% of outcome labels and show that we can estimate the failure rate for ALL levels of leniency despite the leniency of this one judge is only 0.25) (2nd priority)
-- 
GitLab