diff --git a/analysis_and_scripts/code_jung.stan b/analysis_and_scripts/code_jung.stan new file mode 100644 index 0000000000000000000000000000000000000000..8ee246647553fdac0c131969cccc86e86c09a732 --- /dev/null +++ b/analysis_and_scripts/code_jung.stan @@ -0,0 +1,91 @@ +data { + int<lower=1> D; // Dimensions + int<lower=0> N_obs; // Number of "observed observations" (with T = 1) + int<lower=0> N_cens; // Number of "censored observations" (with T = 0) + int<lower=1> M; // Number of judges + int<lower=1> K; // Number of groups + real<lower=0> sigma_tau; + int<lower=1, upper=M> jj_obs[N_obs]; // judge_ID + int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID + int<lower=1, upper=K> kk_obs[N_obs]; // Grouping + int<lower=1, upper=K> kk_cens[N_cens]; // Grouping + int<lower=0, upper=1> dec_obs[N_obs]; // Positive decisions + int<lower=0, upper=1> dec_cens[N_cens]; // Negative decisions + row_vector[D] X_obs[N_obs]; // Private features of "observed observations" (with T = 1) + row_vector[D] X_cens[N_cens]; // Private features of "censored observations" (with T = 1) + int<lower=0, upper=1> y_obs[N_obs]; // Observed outcomes +} + +parameters { + vector[N_obs] Z_obs; + vector[N_cens] Z_cens; + real alpha_T[M]; // Judge-specific intercepts + + vector[D] beta_XT[K]; + vector[D] beta_XY[K]; + + vector<lower=0>[K] beta_ZT_raw; // Coefficient for the latent variable. + vector<lower=0>[K] beta_ZY_raw; // Coefficient for the latent variable. + + real<lower=0> tau_XT; + real<lower=0> tau_XY; + real<lower=0> tau_ZT; + real<lower=0> tau_ZY; +} + +transformed parameters{ + vector<lower=0>[K] beta_ZT_cumulative; + vector<lower=0>[K] beta_ZY_cumulative; + + vector<lower=0>[K] beta_ZT; + vector<lower=0>[K] beta_ZY; + + beta_ZT[1] = beta_ZT_raw[1]; + beta_ZY[1] = beta_ZY_raw[1]; + + if(K >= 2){ + beta_ZT[2:] = tau_ZT * beta_ZT_raw[2:]; + beta_ZY[2:] = tau_ZY * beta_ZY_raw[2:]; + } + + beta_ZT_cumulative = cumulative_sum(beta_ZT); + beta_ZY_cumulative = cumulative_sum(beta_ZY); +} + +model { + Z_obs ~ std_normal(); + Z_cens ~ std_normal(); + + tau_XY ~ normal(0, sigma_tau); + tau_XT ~ normal(0, sigma_tau); + tau_ZY ~ normal(0, sigma_tau); + tau_ZT ~ normal(0, sigma_tau); + + beta_XT[1] ~ std_normal(); // first group + beta_XY[1] ~ std_normal(); + beta_ZT_raw ~ std_normal(); + beta_ZY_raw ~ std_normal(); + + if(K >= 2){ + for(i in 2:K){ // random walk prior here + beta_XT[i] ~ normal(beta_XT[i-1], tau_XT); // ith group + beta_XY[i] ~ normal(beta_XY[i-1], tau_XY); + } + } + + for(i in 1:N_obs){ + dec_obs[i] ~ bernoulli_logit(alpha_T[jj_obs[i]] + X_obs[i] * beta_XT[kk_obs[i]] + beta_ZT_cumulative[kk_obs[i]] * Z_obs[i]); + y_obs[i] ~ bernoulli_logit(X_obs[i] * beta_XY[kk_obs[i]] + beta_ZY_cumulative[kk_obs[i]] * Z_obs[i]); + } + + for(i in 1:N_cens) + dec_cens[i] ~ bernoulli_logit(alpha_T[jj_cens[i]] + X_cens[i] * beta_XT[kk_cens[i]] + beta_ZT_cumulative[kk_cens[i]] * Z_cens[i]); +} + +generated quantities { + real<lower=0, upper=1> y_est[N_cens]; + + for(i in 1:N_cens){ + y_est[i] = inv_logit(X_cens[i] * beta_XY[kk_cens[i]] + beta_ZY_cumulative[kk_cens[i]] * Z_cens[i]); + } +} diff --git a/analysis_and_scripts/stan_modelling_empirical.py b/analysis_and_scripts/stan_modelling_empirical.py new file mode 100644 index 0000000000000000000000000000000000000000..e05c3a7653f0f5f5ae5b57fdae9498687cf2d63b --- /dev/null +++ b/analysis_and_scripts/stan_modelling_empirical.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Author: Riku Laine +# Date: 26JUL2019 +# Project name: Potential outcomes in model 
evaluation +# Description: This script creates the figures and results used +# in empirical data experiments. +# +# Parameters: +# ----------- +# (1) figure_path : file name for saving the created figures. +# (2) nIter : Number of train test splits to perform on the data. +# (3) group_amount : How many groups if Jung-inspired model is used. +# (4) stan_code_file_name : Name of file containing the stan model code. +# (5) sigma_tau : Values of prior variance for the Jung-inspired model. +# (6) data_path : File of compas data. + +""" + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import scipy.stats as scs +import scipy.special as ssp +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +import pystan + +plt.switch_backend('agg') + +import sys + +# figure storage name +figure_path = sys.argv[1] + +# Iterations +nIter = int(sys.argv[2]) + +# How many groups if jung model is used +group_amount = int(sys.argv[3]) + +# Name of stan model code file +stan_code_file_name = sys.argv[4] + +# Variance prior +sigma_tau = float(sys.argv[5]) + +# figure storage name +data_path = sys.argv[6] + +# Prefix for the figures and log files + +print("These results have been obtained with the following settings:") + +print("Number of train test splits :", nIter) + +print("Number of groups:", group_amount) + +print("Prior for the variances:", sigma_tau) + +save_name = "sl_compas" + +def inv_logit(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def logit(x): + return np.log(x) - np.log(1.0 - x) + + +def inverse_cumulative(x, mu, sigma): + '''Compute the inverse of the cumulative distribution of logit-normal + distribution at x with parameters mu and sigma (mean and st.dev.).''' + + return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) + +########################## + +# ## Evaluator modules + +# ### Convenience functions + +def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None): + ''' + Fit a predictive model (default logistic regression) with given training + instances and return probabilities for test instances to obtain a given + class label. + + Arguments: + ---------- + + x_train -- x values of training instances + y_train -- y values of training instances + x_test -- x values of test instances + class_value -- class label for which the probabilities are counted for. + model_type -- type of model to be fitted. + + Returns: + -------- + (1) Trained predictive model + (2) Probabilities for given test inputs for given class. + ''' + + if model_type is None or model_type in ["logistic_regression", "lr"]: + # Instantiate the model (using the default parameters) + logreg = LogisticRegression(solver='lbfgs') + + # Check shape and fit the model. + if x_train.ndim == 1: + logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train) + else: + logreg = logreg.fit(x_train, y_train) + + label_probs_logreg = getProbabilityForClass(x_test, logreg, + class_value) + + return logreg, label_probs_logreg + + elif model_type in ["random_forest", "rf"]: + # Instantiate the model + forest = RandomForestClassifier(n_estimators=100, max_depth=3) + + # Check shape and fit the model. 
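+ # sklearn estimators expect a 2-D feature matrix of shape (n_samples, n_features),
+ # so a one-dimensional training vector is reshaped into a single column before fitting.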
+ if x_train.ndim == 1: + forest = forest.fit(x_train.values.reshape(-1, 1), y_train) + else: + forest = forest.fit(x_train, y_train) + + label_probs_forest = getProbabilityForClass(x_test, forest, + class_value) + + return forest, label_probs_forest + + elif model_type == "fully_random": + + label_probs = np.ones_like(x_test) / 2 + + model_object = lambda x: 0.5 + + return model_object, label_probs + else: + raise ValueError("Invalid model_type!", model_type) + + +def getProbabilityForClass(x, model, class_value): + ''' + Function (wrapper) for obtaining the probability of a class given x and a + predictive model. + + Arguments: + ----------- + x -- individual features, an array of shape (observations, features) + model -- a trained sklearn model. Predicts probabilities for given x. + Should accept input of shape (observations, features) + class_value -- the resulting class to predict (usually 0 or 1). + + Returns: + -------- + (1) The probabilities of given class label for each x. + ''' + if x.ndim == 1: + # if x is vector, transform to column matrix. + f_values = model.predict_proba(np.array(x).reshape(-1, 1)) + else: + f_values = model.predict_proba(x) + + # Get correct column of predicted class, remove extra dimensions and return. + return f_values[:, model.classes_ == class_value].flatten() + + +# ### Contraction algorithm +# +# Below is an implementation of Lakkaraju's team's algorithm presented in +# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant +# parameters to be passed to the function are presented in the description. + +def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, + accRateR_col, r): + ''' + This is an implementation of the algorithm presented by Lakkaraju + et al. in their paper "The Selective Labels Problem: Evaluating + Algorithmic Predictions in the Presence of Unobservables" (2017). + + Arguments: + ---------- + df -- The (Pandas) data frame containing the data, judge decisions, + judge IDs, results and probability scores. + judgeIDJ_col -- String, the name of the column containing the judges' IDs + in df. + decisionT_col -- String, the name of the column containing the judges' decisions + resultY_col -- String, the name of the column containing the realization + modelProbS_col -- String, the name of the column containing the probability + scores from the black-box model B. + accRateR_col -- String, the name of the column containing the judges' + acceptance rates + r -- Float between 0 and 1, the given acceptance rate. + + Returns: + -------- + (1) The estimated failure rate at acceptance rate r. + ''' + # Get ID of the most lenient judge. + most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()] + + # Subset. "D_q is the set of all observations judged by q." + D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() + + # All observations of R_q have observed outcome labels. + # "R_q is the set of observations in D_q with observed outcome labels." + R_q = D_q[D_q[decisionT_col] == 1].copy() + + # Sort observations in R_q in descending order of confidence scores S and + # assign to R_sort_q. 
+ # "Observations deemed as high risk by B are at the top of this list" + R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False) + + number_to_remove = int( + round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0]))) + + # "R_B is the list of observations assigned to t = 1 by B" + R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] + + return np.sum(R_B[resultY_col] == 0) / D_q.shape[0] + + +# ### Evaluators + +def contractionEvaluator(df, featureX_col, judgeIDJ_col, decisionT_col, + resultY_col, accRateR_col, r): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + # Invoke the original contraction. + FR = contraction(test, + judgeIDJ_col=judgeIDJ_col, + decisionT_col=decisionT_col, + resultY_col=resultY_col, + modelProbS_col="B_prob_0_model", + accRateR_col=accRateR_col, + r=r) + + return FR + + +def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r, + fit_model=True): + + train, test = train_test_split(df, test_size=0.5) + + if fit_model: + B_model, predictions = fitPredictiveModel(train[featureX_col], + train[resultY_col], + test[featureX_col], 0) + + test = test.assign(B_prob_0_model=predictions) + + test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + to_release = int(round(test.shape[0] * r)) + + return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + + +def labeledOutcomesEvaluator(df, + featureX_col, + decisionT_col, + resultY_col, + r, + adjusted=False, + fit_model=True): + + train, test = train_test_split(df, test_size=0.5) + + if fit_model: + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + test_observed = test.loc[test[decisionT_col] == 1, :] + + test_observed = test_observed.sort_values(by='B_prob_0_model', + inplace=False, + ascending=True) + + to_release = int(round(test_observed.shape[0] * r)) + + if adjusted: + return np.mean(test_observed[resultY_col][0:to_release] == 0) + + return np.sum( + test_observed[resultY_col][0:to_release] == 0) / test.shape[0] + + +def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, + accRateR_col, r): + + # Get judges with correct leniency as list + is_correct_leniency = df[accRateR_col].round(1) == r + + # No judges with correct leniency + if np.sum(is_correct_leniency) == 0: + return np.nan + + correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] + + # Released are the people they judged and released, T = 1 + released = df[df[judgeIDJ_col].isin(correct_leniency_list) + & (df[decisionT_col] == 1)] + + # Get their failure rate, aka ratio of reoffenders to number of people judged in total + return np.sum(released[resultY_col] == 0) / correct_leniency_list.shape[0] + + +################### + +# Read in the data +compas_raw = pd.read_csv(data_path) + +# Select columns +compas = compas_raw[[ + 'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', + 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', + 'two_year_recid', 'c_jail_in', 'c_jail_out' +]] + +# Subset values, see reasons in ProPublica methodology. 
+compas = compas.query('days_b_screening_arrest <= 30 and \ + days_b_screening_arrest >= -30 and \ + is_recid != -1 and \ + c_charge_degree != "O"') + +# Drop row if score_text is na +compas = compas[compas.score_text.notnull()] + +# Recode recidivism values to fit earlier notation +# So here result_Y = 1 - is_recid (inverted binary coding). +compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1) + +# Convert string values to dummies, drop first so full rank +compas_dummy = pd.get_dummies( + compas, + columns=['c_charge_degree', 'race', 'age_cat', 'sex'], + drop_first=True) + +compas_dummy.drop(columns=[ + 'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', + 'two_year_recid', 'score_text', 'is_recid' +], + inplace=True) + +# Shuffle rows for random judge assignment +compas_shuffled = compas_dummy.sample(frac=1) + +nJudges_M = 9 + +# Assign judges as evenly as possible +judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False) + +# Assign fixed leniencies from 0.1 to 0.9 +judge_leniency = np.arange(1, 10) / 10 + +judge_leniency = judge_leniency[judge_ID] + +compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID, + judge_leniency=judge_leniency) + +# Sort by judges then probabilities in decreasing order +# Most dangerous for each judge are at the top. +compas_shuffled.sort_values(by=["judge_ID", "decile_score"], + ascending=False, + inplace=True) + +# Iterate over the data. Subject will be given a negative decision +# if they are in the top (1-r)*100% of the individuals the judge will judge. +# I.e. if their within-judge-index is under 1 - acceptance threshold times +# the number of subjects assigned to each judge they will receive a +# negative decision. +compas_shuffled.reset_index(drop=True, inplace=True) + +subjects_allocated = compas_shuffled.judge_ID.value_counts() + +compas_shuffled['judge_index'] = compas_shuffled.groupby('judge_ID').cumcount() + +compas_shuffled['decision_T'] = np.where( + compas_shuffled['judge_index'] < (1 - compas_shuffled['judge_leniency']) * + subjects_allocated[compas_shuffled['judge_ID']].values, 0, 1) + +compas_labeled = compas_shuffled.copy() +compas_unlabeled = compas_shuffled.copy() + +# Hide unobserved +compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan + +# Choose feature_columns +feature_cols = ~compas_labeled.columns.isin( + ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T']) + +feature_cols = compas_labeled.columns[feature_cols] + + +####### Draw figures ########### + +failure_rates = np.zeros((8, 6)) +failure_sems = np.zeros((8, 6)) + +f_rate_true = np.zeros((nIter, 8)) +f_rate_label = np.zeros((nIter, 8)) +f_rate_label_adj = np.zeros((nIter, 8)) +f_rate_human = np.zeros((nIter, 8)) +f_rate_cont = np.zeros((nIter, 8)) +f_rate_caus = np.zeros((nIter, 8)) + + +# Split data +train, test = train_test_split(compas_labeled, test_size=0.5) + +# Train a logistic regression model +B_model, predictions = fitPredictiveModel( + train.loc[train['decision_T'] == 1, feature_cols], + train.loc[train['decision_T'] == 1, 'result_Y'], test[feature_cols], 0) + +test = test.assign(B_prob_0_model=predictions) + +test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + +kk_array = pd.qcut(test['B_prob_0_model'], group_amount, labels=False) + +# Find observed values +observed = test['decision_T'] == 1 + +# Assign data to the model +dat = dict(D=1, + N_obs=np.sum(observed), + N_cens=np.sum(~observed), + K=group_amount, + sigma_tau=sigma_tau, + 
M=len(set(compas_labeled['judge_ID'])), + jj_obs=test.loc[observed, 'judge_ID']+1, + jj_cens=test.loc[~observed, 'judge_ID']+1, + kk_obs=kk_array[observed]+1, + kk_cens=kk_array[~observed]+1, + dec_obs=test.loc[observed, 'decision_T'], + dec_cens=test.loc[~observed, 'decision_T'], + X_obs=test.loc[observed, 'B_prob_0_model'].values.reshape(-1,1), + X_cens=test.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1), + y_obs=test.loc[observed, 'result_Y'].astype(int)) + +sm = pystan.StanModel(file=stan_code_file_name) + +fit = sm.sampling(data=dat, chains=5, iter=4000, control = dict(adapt_delta=0.9)) + +pars = fit.extract() + +print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w')) + +plt.figure(figsize=(15,30)) + +fit.plot(); + +plt.savefig(save_name + '_stan_diagnostic_plot') + +plt.show() +plt.close('all') + +# Bayes + +# Alusta matriisi, rivillä yksi otos posteriorista +# sarakkeet havaintoja +y_imp = np.ones((pars['y_est'].shape[0], test.shape[0])) + +# Täydennetään havaitsemattomat estimoiduilla +y_imp[:, ~observed] = 1-pars['y_est'] + +# Täydennetään havaitut havaituilla +y_imp[:, observed] = 1-test.loc[observed, 'result_Y'] + +Rs = np.arange(.1, .9, .1) + +to_release_list = np.round(test.shape[0] * Rs).astype(int) + +f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan) + +for i in range(len(to_release_list)): + est_failure_rates = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) / test.shape[0] + + f_rate_bayes[:, i] = est_failure_rates + + failure_rates[i, 5] = np.mean(est_failure_rates) + +for i in range(nIter): + + print(" [", i, "] ", sep='', end="") + + for r in np.arange(1, 9): + + print(".", end="") + + # True evaluation + + f_rate_true[i, r - 1] = trueEvaluationEvaluator( + compas_unlabeled, feature_cols, 'decision_T', 'result_Y', r / 10) + + # Labeled outcomes only + + f_rate_label[i, r - 1] = labeledOutcomesEvaluator( + compas_labeled, feature_cols, 'decision_T', 'result_Y', r / 10) + + # Adjusted labeled outcomes + + f_rate_label_adj[i, r - 1] = labeledOutcomesEvaluator( + compas_labeled, + feature_cols, + 'decision_T', + 'result_Y', + r / 10, + adjusted=True) + + # Human evaluation + + f_rate_human[i, r - 1] = humanEvaluationEvaluator( + compas_labeled, 'judge_ID', 'decision_T', 'result_Y', + 'judge_leniency', r / 10) + + # Contraction + + f_rate_cont[i, r - 1] = contractionEvaluator( + compas_labeled, feature_cols, 'judge_ID', 'decision_T', 'result_Y', + 'judge_leniency', r / 10) + +failure_rates[:, 0] = np.mean(f_rate_true, axis=0) +failure_rates[:, 1] = np.mean(f_rate_label, axis=0) +failure_rates[:, 2] = np.mean(f_rate_label_adj, axis=0) +failure_rates[:, 3] = np.mean(f_rate_human, axis=0) +failure_rates[:, 4] = np.mean(f_rate_cont, axis=0) + +failure_sems[:, 0] = scs.sem(f_rate_true, axis=0) +failure_sems[:, 1] = scs.sem(f_rate_label, axis=0) +failure_sems[:, 2] = scs.sem(f_rate_label_adj, axis=0) +failure_sems[:, 3] = scs.sem(f_rate_human, axis=0) +failure_sems[:, 4] = scs.sem(f_rate_cont, axis=0) +failure_sems[:, 5] = scs.sem(f_rate_bayes, axis=0, nan_policy='omit') + +x_ax = np.arange(0.1, 0.9, 0.1) + +labels = [ + 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', + 'Human evaluation', 'Contraction', 'Potential outcomes' +] +colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c'] + +for i in range(failure_rates.shape[1]): + plt.errorbar(x_ax, + failure_rates[:, i], + label=labels[i], + c=colours[i], + yerr=failure_sems[:, i]) + +plt.title('Failure rate vs. 
Acceptance rate') +plt.xlabel('Acceptance rate') +plt.ylabel('Failure rate') +plt.legend() +plt.grid() + +plt.savefig(save_name + '_all') + +plt.show() + +print("\nFailure rates:") +print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) + +print("\nMean absolute errors:") +for i in range(1, failure_rates.shape[1]): + print( + labels[i].ljust(len(max(labels, key=len))), + np.round( + np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) + + +# Draw diagnostic figures +f_rates= [f_rate_true, f_rate_label, f_rate_label_adj, f_rate_human, + f_rate_cont, f_rate_bayes] + +cols = 2 +rows = np.ceil(len(f_rates) / cols) + +plt.figure(figsize=(16, 4.5*rows+1)) + +ax = plt.subplot(rows, cols, 1) +x_ax = np.arange(1, 9, 1) / 10 + +plt.boxplot(f_rates[0], labels=x_ax) + +plt.title(labels[0]) +plt.xlabel('Acceptance rate') +plt.ylabel('Failure rate') +plt.grid() + +for i in range(len(f_rates)): + plt.subplot(rows, cols, i + 1, sharey=ax) + + plt.boxplot(f_rates[i], labels=x_ax) + + plt.title(labels[i]) + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.grid() + +plt.tight_layout() +plt.subplots_adjust(top=0.89) + +title = "COMPAS data set\nFluctuation of failure rate estimates per method" + +plt.suptitle(title, y=0.96, weight='bold') + +plt.savefig(save_name + '_diagnostic_plot') + +plt.show() \ No newline at end of file diff --git a/analysis_and_scripts/stan_modelling_theoretic.py b/analysis_and_scripts/stan_modelling_theoretic.py new file mode 100644 index 0000000000000000000000000000000000000000..4f3c5a98202f1bab666a3b7021b8e06d89ab5d10 --- /dev/null +++ b/analysis_and_scripts/stan_modelling_theoretic.py @@ -0,0 +1,1267 @@ +''' +# Author: Riku Laine +# Date: 25JUL2019 (start) +# Project name: Potential outcomes in model evaluation +# Description: This script creates the figures and results used +# in synthetic data experiments. +# +# Parameters: +# ----------- +# (1) figure_path : file name for saving the created figures. +# (2) N_sim : Size of simulated data set. +# (3) M_sim : Number of judges in simulated data, +# N_sim must be divisible by M_sim! +# (4) which : Which data + outcome analysis should be performed. +# (5) group_amount : How many groups if Jung-inspired model is used. +# (6) stan_code_file_name : Name of file containing the stan model code. +# (7) sigma_tau : Values of prior variance for the Jung-inspired model. +# +# Modifications: +# -------------- +# 26JUL2019 : RL - Changed add_epsilon default to True +# - Corrected the data set used by the causal model (from +# unlabeled to labeled) +# - Corrections to which == 11, coefficients and prints. +# - All plot, thinning off. +# - Fix one decider with leniency 0.9 +# +''' +# Refer to the `notes.tex` file for explanations about the modular framework. + +# Imports + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import scipy.stats as scs +import scipy.special as ssp +import scipy.integrate as si +import numpy.random as npr +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +import pystan +import gc + +plt.switch_backend('agg') + +import sys + +# figure storage name +figure_path = sys.argv[1] + +# Size of simulated data set +N_sim = int(sys.argv[2]) + +# Number of judges in simulated data, N_sim must be divisible by M_sim! +M_sim = int(sys.argv[3]) + +# Which data + outcome generation should be performed. 
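+# 1 = Bernoulli outcome, independent decisions, no unobservables
+# 2 = Bernoulli outcome, independent decisions, with unobservables
+# 3 = threshold outcome, independent decisions, with unobservables
+# 4 = Bernoulli outcome, non-independent (batch) decisions, with unobservables
+# 5 = threshold outcome, non-independent (batch) decisions, with unobservables
+# 6 = random decider, 7 = biased decider, 8 = 'bad' decider (beta_X = 0.2)
+# 9 = Bernoulli outcome + Bernoulli decider, 10 = batch decisions with beta_Z = 3,
+# 11 = batch decisions with beta_Z = 5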
+which = int(sys.argv[4]) + +# How many groups if jung model is used +group_amount = int(sys.argv[5]) + +# Name of stan model code file +stan_code_file_name = sys.argv[6] + +# Variance prior +sigma_tau = float(sys.argv[7]) + +# Settings + +plt.rcParams.update({'font.size': 16}) +plt.rcParams.update({'figure.figsize': (10, 6)}) + +print("These results have been obtained with the following settings:") + +print("Number of observations in the simulated data:", N_sim) + +print("Number of judges in the simulated data:", M_sim) + +print("Number of groups:", group_amount) + +print("Prior for the variances:", sigma_tau) + +# Basic functions + + +def inv_logit(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def logit(x): + return np.log(x) - np.log(1.0 - x) + + +def inverse_cumulative(x, mu, sigma): + '''Compute the inverse of the cumulative distribution of logit-normal + distribution at x with parameters mu and sigma (mean and st.dev.).''' + + return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) + + +# ## Data generation modules + +def bernoulliDGWithoutUnobservables(N_total=50000): + '''Generates data | Variables: X, Y | Outcome from Bernoulli''' + + df = pd.DataFrame() + + # Sample feature X from standard Gaussian distribution, N(0, 1). + df = df.assign(X=npr.normal(size=N_total)) + + # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inv_logit(X) + df = df.assign(probabilities_Y=inv_logit(df.X)) + + # Draw Y ~ Bernoulli(1 - inv_logit(X)) + # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inv_logit(X) + results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) + + df = df.assign(result_Y=results) + + return df + + +def thresholdDGWithUnobservables(N_total=50000, + beta_X=1.0, + beta_Z=1.0, + beta_W=0.2): + '''Generates data | Variables: X, Z, W, Y | Outcome by threshold''' + + df = pd.DataFrame() + + # Sample the variables from standard Gaussian distributions. + df = df.assign(X=npr.normal(size=N_total)) + df = df.assign(Z=npr.normal(size=N_total)) + df = df.assign(W=npr.normal(size=N_total)) + + # Calculate P(Y=0|X, Z, W) + probabilities_Y = inv_logit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) + + df = df.assign(probabilities_Y=probabilities_Y) + + # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise + df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1)) + + return df + + +def bernoulliDGWithUnobservables(N_total=50000, + beta_X=1.0, + beta_Z=1.0, + beta_W=0.2): + '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli''' + + df = pd.DataFrame() + + # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1). + df = df.assign(X=npr.normal(size=N_total)) + df = df.assign(Z=npr.normal(size=N_total)) + df = df.assign(W=npr.normal(size=N_total)) + + # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inv_logit(X) + probabilities_Y = inv_logit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) + + df = df.assign(probabilities_Y=probabilities_Y) + + # Draw Y from Bernoulli distribution + results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) + + df = df.assign(result_Y=results) + + return df + + +# ## Decider modules + +def humanDeciderLakkaraju(df, + featureX_col, + featureZ_col=None, + nJudges_M=100, + beta_X=1, + beta_Z=1, + add_epsilon=True): + '''Decider module | Non-independent batch decisions.''' + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Compute the number of subjects allocated for each judge. 
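+ # E.g. with the data-generation default N_total = 50000 and the default nJudges_M = 100,
+ # each judge is assigned 500 subjects.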
+ nSubjects_N = int(df.shape[0] / nJudges_M) + + # Assign judge IDs as running numbering from 0 to nJudges_M - 1 + df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) + + # Sample acceptance rates uniformly from a closed interval + # from 0.1 to 0.9 and round to tenth decimal place. + # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all + # values. + acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) + acceptance_rates = np.round(acceptance_rates, 10) + + # Replicate the rates so they can be attached to the corresponding judge ID. + df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + + if add_epsilon: + epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) + else: + epsilon = 0 + + if featureZ_col is None: + probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + else: + probabilities_T = inv_logit(beta_X * df[featureX_col] + + beta_Z * df[featureZ_col] + epsilon) + + + df = df.assign(probabilities_T=probabilities_T) + + # Sort by judges then probabilities in decreasing order + # Most dangerous for each judge are at the top. + df.sort_values(by=["judgeID_J", "probabilities_T"], + ascending=False, + inplace=True) + + # Iterate over the data. Subject will be given a negative decision + # if they are in the top (1-r)*100% of the individuals the judge will judge. + # I.e. if their within-judge-index is under 1 - acceptance threshold times + # the number of subjects assigned to each judge they will receive a + # negative decision. + df.reset_index(drop=True, inplace=True) + + df['decision_T'] = np.where((df.index.values % nSubjects_N) < + ((1 - df['acceptanceRate_R']) * nSubjects_N), + 0, 1) + + df_labeled = df.copy() + + # Hide unobserved + df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan + + return df_labeled, df + + +def bernoulliDecider(df, + featureX_col, + featureZ_col=None, + nJudges_M=100, + beta_X=1, + beta_Z=1, + add_epsilon=True): + '''Use X and Z to make a decision with probability + P(T=0|X, Z)=inv_logit(beta_X*X+beta_Z*Z).''' + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Compute the number of subjects allocated for each judge. + nSubjects_N = int(df.shape[0] / nJudges_M) + + # Assign judge IDs as running numbering from 0 to nJudges_M - 1 + df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) + + if add_epsilon: + epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) + else: + epsilon = 0 + + if featureZ_col is None: + probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + else: + probabilities_T = inv_logit(beta_X * df[featureX_col] + + beta_Z * df[featureZ_col] + epsilon) + + df = df.assign(probabilities_T=probabilities_T) + + # Draw T from Bernoulli distribution + decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0]) + + df = df.assign(decision_T=decisions) + + # Calculate the acceptance rates. + acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values + + # Replicate the rates so they can be attached to the corresponding judge ID. 
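+ # Note: this relies on groupby('judgeID_J') returning the per-judge means in ascending
+ # judge ID order, which matches the blockwise ordering produced by np.repeat above.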
+ df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + + df_labeled = df.copy() + + df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan + + return df_labeled, df + + +def quantileDecider(df, + featureX_col, + featureZ_col=None, + nJudges_M=100, + beta_X=1, + beta_Z=1, + add_epsilon=True): + '''Assign decisions by the value of inverse cumulative distribution function + of the logit-normal distribution at leniency r.''' + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Compute the number of subjects allocated for each judge. + nSubjects_N = int(df.shape[0] / nJudges_M) + + # Assign judge IDs as running numbering from 0 to nJudges_M - 1 + df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) + + # Sample acceptance rates uniformly from a closed interval + # from 0.1 to 0.9 and round to tenth decimal place. + # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all + # values. + acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) + acceptance_rates = np.round(acceptance_rates, 10) + + # Replicate the rates so they can be attached to the corresponding judge ID. + df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + + if add_epsilon: + epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) + else: + epsilon = 0 + + if featureZ_col is None: + probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + + # Compute the bounds straight from the inverse cumulative. + # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2. + df = df.assign(bounds=inverse_cumulative( + x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2))) + else: + probabilities_T = inv_logit(beta_X * df[featureX_col] + + beta_Z * df[featureZ_col] + epsilon) + + # Compute the bounds straight from the inverse cumulative. + # Assuming X and Z are i.i.d standard Gaussians with variance 1. + # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z). + df = df.assign(bounds=inverse_cumulative( + x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2))) + + df = df.assign(probabilities_T=probabilities_T) + + # Assign negative decision if the predicted probability (probabilities_T) is + # over the judge's threshold (bounds). + df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1)) + + df_labeled = df.copy() + + df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan + + return df_labeled, df + + +def randomDecider(df, nJudges_M=100, use_acceptance_rates=False): + '''Doesn't use any information about X and Z to make decisions. + + If use_acceptance_rates is False (default) then all decisions are positive + with probabiltiy 0.5. If True, probabilities will be sampled from + U(0.1, 0.9) and rounded to tenth decimal place.''' + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Compute the number of subjects allocated for each judge. + nSubjects_N = int(df.shape[0] / nJudges_M) + + # Assign judge IDs as running numbering from 0 to nJudges_M - 1 + df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) + + if use_acceptance_rates: + # Sample acceptance rates uniformly from a closed interval + # from 0.1 to 0.9 and round to tenth decimal place. + acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10) + else: + # No real leniency here -> set to 0.5. 
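+ # With p = 0.5 for every judge, the decisions drawn below are fair coin flips,
+ # independent of both X and Z.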
+ acceptance_rates = np.ones(nJudges_M) * 0.5 + + # Replicate the rates so they can be attached to the corresponding judge ID. + df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + + df = df.assign( + decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0])) + + df_labeled = df.copy() + + df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan + + return df_labeled, df + + +def biasDecider(df, + featureX_col, + featureZ_col=None, + nJudges_M=100, + beta_X=1, + beta_Z=1, + add_epsilon=True): + ''' + Biased decider: If X > 1, then X <- X * 0.75. People with high X, + get more positive decisions as they should. + + ''' + + # If X > 1, then X <- X * 0.75. People with high X, get more positive + # decisions as they should + df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] * + 0.75, df[featureX_col])) + + # If -2 < X -1, then X <- X + 0.5. People with X in [-2, 1], get less + # positive decisions as they should + df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1) == 1, + df.biased_X + 0.5, df.biased_X) + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Use quantile decider, but judge by the biased X. + df_labeled, df = humanDeciderLakkaraju(df, + featureX_col='biased_X', + featureZ_col=featureZ_col, + nJudges_M=nJudges_M, + beta_X=beta_X, + beta_Z=beta_Z, + add_epsilon=add_epsilon) + + return df_labeled, df + + +# ## Evaluator modules + +# ### Convenience functions + +def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None): + ''' + Fit a predictive model (default logistic regression) with given training + instances and return probabilities for test instances to obtain a given + class label. + + Arguments: + ---------- + + x_train -- x values of training instances + y_train -- y values of training instances + x_test -- x values of test instances + class_value -- class label for which the probabilities are counted for. + model_type -- type of model to be fitted. + + Returns: + -------- + (1) Trained predictive model + (2) Probabilities for given test inputs for given class. + ''' + + if model_type is None or model_type in ["logistic_regression", "lr"]: + # Instantiate the model (using the default parameters) + logreg = LogisticRegression(solver='lbfgs') + + # Check shape and fit the model. + if x_train.ndim == 1: + logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train) + else: + logreg = logreg.fit(x_train, y_train) + + label_probs_logreg = getProbabilityForClass(x_test, logreg, + class_value) + + return logreg, label_probs_logreg + + elif model_type in ["random_forest", "rf"]: + # Instantiate the model + forest = RandomForestClassifier(n_estimators=100, max_depth=3) + + # Check shape and fit the model. + if x_train.ndim == 1: + forest = forest.fit(x_train.values.reshape(-1, 1), y_train) + else: + forest = forest.fit(x_train, y_train) + + label_probs_forest = getProbabilityForClass(x_test, forest, + class_value) + + return forest, label_probs_forest + + elif model_type == "fully_random": + + label_probs = np.ones_like(x_test) / 2 + + model_object = lambda x: 0.5 + + return model_object, label_probs + else: + raise ValueError("Invalid model_type!", model_type) + + +def getProbabilityForClass(x, model, class_value): + ''' + Function (wrapper) for obtaining the probability of a class given x and a + predictive model. 
+ + Arguments: + ----------- + x -- individual features, an array of shape (observations, features) + model -- a trained sklearn model. Predicts probabilities for given x. + Should accept input of shape (observations, features) + class_value -- the resulting class to predict (usually 0 or 1). + + Returns: + -------- + (1) The probabilities of given class label for each x. + ''' + if x.ndim == 1: + # if x is vector, transform to column matrix. + f_values = model.predict_proba(np.array(x).reshape(-1, 1)) + else: + f_values = model.predict_proba(x) + + # Get correct column of predicted class, remove extra dimensions and return. + return f_values[:, model.classes_ == class_value].flatten() + + +def cdf(x_0, model, class_value): + ''' + Cumulative distribution function as described above. Integral is + approximated using Simpson's rule for efficiency. + + Arguments: + ---------- + + x_0 -- private features of an instance for which the value of cdf is to be + calculated. + model -- a trained sklearn model. Predicts probabilities for given x. + Should accept input of shape (observations, features) + class_value -- the resulting class to predict (usually 0 or 1). + + ''' + + def prediction(x): + return getProbabilityForClass( + np.array([x]).reshape(-1, 1), model, class_value) + + prediction_x_0 = prediction(x_0) + + x_values = np.linspace(-15, 15, 40000) + + x_preds = prediction(x_values) + + y_values = scs.norm.pdf(x_values) + + results = np.zeros(x_0.shape[0]) + + for i in range(x_0.shape[0]): + + y_copy = y_values.copy() + + y_copy[x_preds > prediction_x_0[i]] = 0 + + results[i] = si.simps(y_copy, x=x_values) + + return results + + +def bailIndicator(r, y_model, x_train, x_test): + ''' + Indicator function for whether a judge will bail or jail a suspect. + Rationale explained above. + + Algorithm: + ---------- + + (1) Calculate recidivism probabilities from training set with a trained + model and assign them to predictions_train. + + (2) Calculate recidivism probabilities from test set with the trained + model and assign them to predictions_test. + + (3) Construct a quantile function of the probabilities in + in predictions_train. + + (4) + For pred in predictions_test: + + if pred belongs to a percentile (computed from step (3)) lower than r + return True + else + return False + + Arguments: + ---------- + + r -- float, acceptance rate, between 0 and 1 + y_model -- a trained sklearn predictive model to predict the outcome + x_train -- private features of the training instances + x_test -- private features of the test instances + + Returns: + -------- + (1) Boolean list indicating a bail decision (bail = True) for each + instance in x_test. + ''' + + predictions_train = getProbabilityForClass(x_train, y_model, 0) + + predictions_test = getProbabilityForClass(x_test, y_model, 0) + + return [ + scs.percentileofscore(predictions_train, pred, kind='weak') < r + for pred in predictions_test + ] + + +# ### Contraction algorithm +# +# Below is an implementation of Lakkaraju's team's algorithm presented in +# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant +# parameters to be passed to the function are presented in the description. + +def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, + accRateR_col, r): + ''' + This is an implementation of the algorithm presented by Lakkaraju + et al. in their paper "The Selective Labels Problem: Evaluating + Algorithmic Predictions in the Presence of Unobservables" (2017). 
+ + Arguments: + ---------- + df -- The (Pandas) data frame containing the data, judge decisions, + judge IDs, results and probability scores. + judgeIDJ_col -- String, the name of the column containing the judges' IDs + in df. + decisionT_col -- String, the name of the column containing the judges' decisions + resultY_col -- String, the name of the column containing the realization + modelProbS_col -- String, the name of the column containing the probability + scores from the black-box model B. + accRateR_col -- String, the name of the column containing the judges' + acceptance rates + r -- Float between 0 and 1, the given acceptance rate. + + Returns: + -------- + (1) The estimated failure rate at acceptance rate r. + ''' + # Get ID of the most lenient judge. + most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()] + + # Subset. "D_q is the set of all observations judged by q." + D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() + + # All observations of R_q have observed outcome labels. + # "R_q is the set of observations in D_q with observed outcome labels." + R_q = D_q[D_q[decisionT_col] == 1].copy() + + # Sort observations in R_q in descending order of confidence scores S and + # assign to R_sort_q. + # "Observations deemed as high risk by B are at the top of this list" + R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False) + + number_to_remove = int( + round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0]))) + + # "R_B is the list of observations assigned to t = 1 by B" + R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] + + return np.sum(R_B[resultY_col] == 0) / D_q.shape[0] + + +# ### Evaluators + +def contractionEvaluator(df, featureX_col, judgeIDJ_col, decisionT_col, + resultY_col, accRateR_col, r): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + # Invoke the original contraction. 
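+ # The black-box model's predicted probabilities for the test half are passed in as the
+ # confidence scores S that contraction() sorts on.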
+ FR = contraction(test, + judgeIDJ_col=judgeIDJ_col, + decisionT_col=decisionT_col, + resultY_col=resultY_col, + modelProbS_col="B_prob_0_model", + accRateR_col=accRateR_col, + r=r) + + return FR + + +def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel(train[featureX_col], + train[resultY_col], + test[featureX_col], 0) + + test = test.assign(B_prob_0_model=predictions) + + test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + to_release = int(round(test.shape[0] * r)) + + return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + + +def labeledOutcomesEvaluator(df, + featureX_col, + decisionT_col, + resultY_col, + r, + adjusted=False): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + test_observed = test.loc[test[decisionT_col] == 1, :] + + test_observed = test_observed.sort_values(by='B_prob_0_model', + inplace=False, + ascending=True) + + to_release = int(round(test_observed.shape[0] * r)) + + if adjusted: + return np.mean(test_observed[resultY_col][0:to_release] == 0) + + return np.sum( + test_observed[resultY_col][0:to_release] == 0) / test.shape[0] + + +def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, + accRateR_col, r): + + # Get judges with correct leniency as list + is_correct_leniency = df[accRateR_col].round(1) == r + + # No judges with correct leniency + if np.sum(is_correct_leniency) == 0: + return np.nan + + correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] + + # Released are the people they judged and released, T = 1 + released = df[df[judgeIDJ_col].isin(correct_leniency_list) + & (df[decisionT_col] == 1)] + + # Get their failure rate, aka ratio of reoffenders to number of people judged in total + return np.sum(released[resultY_col] == 0) / correct_leniency_list.shape[0] + + +def causalEvaluator(df, featureX_col, decisionT_col, resultY_col, r): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + released = cdf(test[featureX_col], B_model, 0) < r + + return np.mean(test.B_prob_0_model * released) + + +def monteCarloEvaluator(df, + featureX_col, + decisionT_col, + resultY_col, + accRateR_col, + r, + mu_X=0, + mu_Z=0, + beta_X=1, + beta_Z=1, + sigma_X=1, + sigma_Z=1): + + # Train the models and assign the predicted probabilities. + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + # Compute the predicted/assumed decision bounds for all the judges. + q_r = inverse_cumulative(x=test[accRateR_col], + mu=mu_X + mu_Z, + sigma=np.sqrt((beta_X * sigma_X)**2 + + (beta_Z * sigma_Z)**2)) + + test = test.assign(bounds=logit(q_r) - test[featureX_col]) + + # Compute the expectation of Z when it is known to come from truncated + # Gaussian. 
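+ # For Z ~ N(mu_Z, sigma_Z^2) truncated at the bound, with a = (bound - mu_Z) / sigma_Z:
+ # E[Z | Z > bound] = mu_Z + sigma_Z * phi(a) / (1 - Phi(a)) (negative decision, T = 0)
+ # E[Z | Z < bound] = mu_Z - sigma_Z * phi(a) / Phi(a) (positive decision, T = 1)
+ # where phi and Phi are the standard normal pdf and cdf.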
+ alphabeta = (test.bounds - mu_Z) / (sigma_Z) + + Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z) # 1 - cdf(ab) + + # E(Z | Z > a). Expectation of Z if negative decision. + exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_ + + # E(Z | Z < b). Expectation of Z if positive decision. + exp_upper_trunc = mu_Z - ( + sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta) + + exp_Z = (1 - test[decisionT_col] + ) * exp_lower_trunc + test[decisionT_col] * exp_upper_trunc + + # Attach the predicted probability for Y=0 to data. + test = test.assign(predicted_Y=inv_logit(test[featureX_col] + exp_Z)) + + # Predictions drawn from binomial. (Should fix this.) + predictions = npr.binomial(n=1, p=1 - test.predicted_Y, size=test.shape[0]) + + test[resultY_col] = np.where(test[decisionT_col] == 0, predictions, + test[resultY_col]) + + test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + to_release = int(round(test.shape[0] * r)) + + return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + + +# ## Performance comparison +# +# Below we try to replicate the results obtained by Lakkaraju and compare +# their model's performance to the one of ours. + +def drawDiagnostics(title, save_name, save, f_rates, titles): + + cols = 2 + rows = np.ceil(len(f_rates) / cols) + + plt.figure(figsize=(16, 4.5*rows+1)) + + ax = plt.subplot(rows, cols, 1) + x_ax = np.arange(1, 9, 1) / 10 + + plt.boxplot(f_rates[0], labels=x_ax) + + plt.title(titles[0]) + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.grid() + + for i in range(len(f_rates)): + plt.subplot(rows, cols, i + 1, sharey=ax) + + plt.boxplot(f_rates[i], labels=x_ax) + + plt.title(titles[i]) + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.grid() + + plt.tight_layout() + plt.subplots_adjust(top=0.89) + plt.suptitle(title, y=0.96, weight='bold') + + if save: + plt.savefig(save_name + '_diagnostic_plot') + + plt.show() + + +def perfComp(dgModule, deciderModule, title, save_name, save=True, nIter=50): + failure_rates = np.zeros((8, 7)) + failure_sems = np.zeros((8, 7)) + + f_rate_true = np.zeros((nIter, 8)) + f_rate_label = np.zeros((nIter, 8)) + f_rate_label_adj = np.zeros((nIter, 8)) + f_rate_human = np.zeros((nIter, 8)) + f_rate_cont = np.zeros((nIter, 8)) + f_rate_caus = np.zeros((nIter, 8)) + + # Create data + df = dgModule() + + # Decicions + df_labeled, df_unlabeled = deciderModule(df) + + # Split data + train, test = train_test_split(df_labeled, test_size=0.5) + + # Train model + B_model, predictions = fitPredictiveModel( + train.loc[train['decision_T'] == 1, 'X'], + train.loc[train['decision_T'] == 1, 'result_Y'], test['X'], 0) + + test = test.assign(B_prob_0_model=predictions) + + test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + kk_array = pd.qcut(test['B_prob_0_model'], group_amount, labels=False) + + # Find observed values + observed = test['decision_T'] == 1 + + # Assign data to the model + dat = dict(D=1, + N_obs=np.sum(observed), + N_cens=np.sum(~observed), + K=group_amount, + sigma_tau=sigma_tau, + M=len(set(df_unlabeled['judgeID_J'])), + jj_obs=test.loc[observed, 'judgeID_J']+1, + jj_cens=test.loc[~observed, 'judgeID_J']+1, + kk_obs=kk_array[observed]+1, + kk_cens=kk_array[~observed]+1, + dec_obs=test.loc[observed, 'decision_T'], + dec_cens=test.loc[~observed, 'decision_T'], + X_obs=test.loc[observed, 'B_prob_0_model'].values.reshape(-1,1), + X_cens=test.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1), + y_obs=test.loc[observed, 
'result_Y'].astype(int)) + + fit = sm.sampling(data=dat, chains=5, iter=4000, control = dict(adapt_delta=0.9)) + + pars = fit.extract() + + plt.figure(figsize=(15,30)) + + fit.plot(); + + if save: + plt.savefig(save_name + '_stan_diagnostic_plot') + + plt.show() + plt.close('all') + + if save: + print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w')) + + # Bayes + + # Alusta matriisi, rivillä yksi otos posteriorista + # sarakkeet havaintoja + y_imp = np.ones((pars['y_est'].shape[0], test.shape[0])) + + # Täydennetään havaitsemattomat estimoiduilla + y_imp[:, ~observed] = 1-pars['y_est'] + + # Täydennetään havaitut havaituilla + y_imp[:, observed] = 1-test.loc[observed, 'result_Y'] + + Rs = np.arange(.1, .9, .1) + + to_release_list = np.round(test.shape[0] * Rs).astype(int) + + f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan) + + for i in range(len(to_release_list)): + est_failure_rates = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) / test.shape[0] + + f_rate_bayes[:, i] = est_failure_rates + + failure_rates[i, 6] = np.mean(est_failure_rates) + + for i in range(nIter): + + print(" [", i, "] ", sep='', end="") + + for r in np.arange(1, 9): + + print(".", end="") + + # True evaluation + + f_rate_true[i, r - 1] = trueEvaluationEvaluator( + df_unlabeled, 'X', 'decision_T', 'result_Y', r / 10) + + # Labeled outcomes only + + f_rate_label[i, r - 1] = labeledOutcomesEvaluator( + df_labeled, 'X', 'decision_T', 'result_Y', r / 10) + + # Adjusted labeled outcomes + + f_rate_label_adj[i, r - 1] = labeledOutcomesEvaluator( + df_labeled, + 'X', + 'decision_T', + 'result_Y', + r / 10, + adjusted=True) + + # Human evaluation + + f_rate_human[i, r - 1] = humanEvaluationEvaluator( + df_labeled, 'judgeID_J', 'decision_T', 'result_Y', + 'acceptanceRate_R', r / 10) + + # Contraction + + f_rate_cont[i, r - 1] = contractionEvaluator( + df_labeled, 'X', 'judgeID_J', 'decision_T', 'result_Y', + 'acceptanceRate_R', r / 10) + + # Causal model - analytic solution + + f_rate_caus[i, r - 1] = monteCarloEvaluator( + df_labeled, 'X', 'decision_T', 'result_Y', 'acceptanceRate_R', + r / 10) + + failure_rates[:, 0] = np.mean(f_rate_true, axis=0) + failure_rates[:, 1] = np.mean(f_rate_label, axis=0) + failure_rates[:, 2] = np.mean(f_rate_label_adj, axis=0) + failure_rates[:, 3] = np.mean(f_rate_human, axis=0) + failure_rates[:, 4] = np.mean(f_rate_cont, axis=0) + failure_rates[:, 5] = np.mean(f_rate_caus, axis=0) + #failure_rates[:, 6] = f_rate_bayes + + failure_sems[:, 0] = scs.sem(f_rate_true, axis=0) + failure_sems[:, 1] = scs.sem(f_rate_label, axis=0) + failure_sems[:, 2] = scs.sem(f_rate_label_adj, axis=0) + failure_sems[:, 3] = scs.sem(f_rate_human, axis=0) + failure_sems[:, 4] = scs.sem(f_rate_cont, axis=0) + failure_sems[:, 5] = scs.sem(f_rate_caus, axis=0) + failure_sems[:, 6] = scs.sem(f_rate_bayes, axis=0, nan_policy='omit') + + x_ax = np.arange(0.1, 0.9, 0.1) + + labels = [ + 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', + 'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes' + ] + colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c'] + + for i in range(failure_rates.shape[1]): + plt.errorbar(x_ax, + failure_rates[:, i], + label=labels[i], + c=colours[i], + yerr=failure_sems[:, i]) + + plt.title('Failure rate vs. 
Acceptance rate') + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.legend() + plt.grid() + + if save: + plt.savefig(save_name + '_all') + + plt.show() + + print("\nFailure rates:") + print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) + + print("\nMean absolute errors:") + for i in range(1, failure_rates.shape[1]): + print( + labels[i].ljust(len(max(labels, key=len))), + np.round( + np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) + + drawDiagnostics(title=title, + save_name=save_name, + save=save, + f_rates=[ + f_rate_true, f_rate_label, f_rate_label_adj, + f_rate_human, f_rate_cont, f_rate_caus, f_rate_bayes + ], + titles=labels) + + +sm = pystan.StanModel(file=stan_code_file_name) + +if which == 1: + print("Without unobservables (Bernoulli + independent decisions)") + + dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim) + + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z" + ) + +gc.collect() +plt.close('all') + +print("With unobservables in the data") + +if which == 2: + print("\nBernoulli + independent decisions") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations \n" + + "Bernoulli + independent decisions, with unobservables", + figure_path + "sl_bernoulli_independent_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 3: + print("\nThreshold rule + independent decisions") + + dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) + + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations \n" + + "Threshold rule + independent decisions, with unobservables", + figure_path + "sl_threshold_independent_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 4: + print("\nBernoulli + non-independent (batch) decisions") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: humanDeciderLakkaraju( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations \n" + + "Bernoulli + non-independent decisions, with unobservables", + figure_path + "sl_bernoulli_batch_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 5: + print("\nThreshold rule + non-independent (batch) decisions") + + dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) + + decider = lambda x: humanDeciderLakkaraju( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations \n" + + "Threshold rule + non-independent decisions, with unobservables", + figure_path + "sl_threshold_batch_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 6: + print("\nRandom decider") + + dg = lambda: 
bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: randomDecider( + x, nJudges_M=M_sim, use_acceptance_rates=True) + + perfComp( + dg, lambda x: decider(x), + "Bernoulli + random decider with leniency and unobservables", + figure_path + "sl_random_decider_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 7: + print("\nBiased decider") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Bernoulli + biased decider with leniency and unobservables", + figure_path + "sl_biased_decider_with_Z", + ) + + +if which == 8: + print("\nBad judge") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim) + + perfComp( + dg, lambda x: decider(x), + "Bernoulli + 'bad' decider with leniency and unobservables", + figure_path + "sl_bad_decider_with_Z" + ) + +gc.collect() +plt.close('all') + +if which == 9: + print("\nBernoulli + Bernoulli") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim) + + perfComp( + dg, lambda x: decider(x), + "Bernoulli + Bernoulli", + figure_path + "sl_bernoulli_bernoulli_with_Z", + ) + +if which == 10: + print("\nBeta_Z = 3, Threshold + batch") + + dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0) + + decider = lambda x: humanDeciderLakkaraju( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Beta_Z = 3, threshold + batch", + figure_path + "sl_threshold_batch_beta_Z_3_with_Z", + ) + +if which == 11: + print("\nBeta_Z = 5, Threshold + batch") + + dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0) + + decider = lambda x: humanDeciderLakkaraju( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Beta_Z = 5, threshold + batch", + figure_path + "sl_threshold_batch_beta_Z_5_with_Z", + ) \ No newline at end of file
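Usage: both driver scripts take their settings as positional command-line arguments in the
order listed in their header comments. A hypothetical invocation (argument values chosen only
for illustration) could look like

    python stan_modelling_theoretic.py figures/ 12000 12 4 10 code_jung.stan 1.0
    python stan_modelling_empirical.py figures/ 10 10 code_jung.stan 1.0 <path to COMPAS csv>

where N_sim (12000) must be divisible by M_sim (12).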