From 02a7d3f5ecef62d8b7ffa059655093b8a682bf86 Mon Sep 17 00:00:00 2001
From: Riku-Laine <28960190+Riku-Laine@users.noreply.github.com>
Date: Mon, 12 Aug 2019 15:24:55 +0300
Subject: [PATCH] Scripts added and renamed, README for scripts

---
 analysis_and_scripts/README.md                |   12 +
 analysis_and_scripts/code_full.stan           |   73 --
 .../code_linear_dependency.stan               |  101 ++
 ...ed.stan => code_nonlinear_dependency.stan} |   19 +-
 analysis_and_scripts/parallel_model_job.job   |   12 +
 .../stan_modelling_empirical.py               |  472 ++++---
 ...modelling_empirical_SEfixed_usefeatures.py |  532 --------
 ...delling_empirical_SEfixed_useprediction.py |  532 --------
 .../stan_modelling_theoretic.py               |  916 +++++++-------
 .../stan_modelling_theoretic_SEfixed_useX.py  | 1101 -----------------
 ...delling_theoretic_SEfixed_useprediction.py | 1101 -----------------
 11 files changed, 790 insertions(+), 4081 deletions(-)
 create mode 100644 analysis_and_scripts/README.md
 delete mode 100644 analysis_and_scripts/code_full.stan
 create mode 100644 analysis_and_scripts/code_linear_dependency.stan
 rename analysis_and_scripts/{code_jung_binarized.stan => code_nonlinear_dependency.stan} (84%)
 create mode 100644 analysis_and_scripts/parallel_model_job.job
 delete mode 100644 analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py
 delete mode 100644 analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py
 delete mode 100644 analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py
 delete mode 100644 analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py

diff --git a/analysis_and_scripts/README.md b/analysis_and_scripts/README.md
new file mode 100644
index 0000000..9624a6b
--- /dev/null
+++ b/analysis_and_scripts/README.md
@@ -0,0 +1,12 @@
+# Files in this folder
+
+The following files are in this folder:
+* Jupyter notebooks are the old analysis files.
+* notes.tex contains much of the research done during the summer.
+* Stan files (Stan is a probabilistic programming language for Bayesian modelling):
+	* code_linear_dependency - Code for the model with a linear connection between the explanatory and dependent variables.
+	* code_nonlinear_dependency - Code for the model with groups. See Jung's arXiv preprint for more info.
+* stan_modelling_* Python files create the figures. The empirical script analyses the COMPAS data and the theoretic script the synthetic data. See the docstrings for parameter descriptions (an illustrative invocation is sketched below).
+* truth_tables.py is an old script from the original repository.
+
+Models can be run on the cluster by submitting the batch file (suffix .job) with the sbatch command.
\ No newline at end of file
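As an illustrative sketch (the argument values below are hypothetical, not taken from the repository), the empirical script's positional arguments documented in its docstring later in this patch map to variables roughly as follows:

```python
# Hypothetical invocation:
#   python stan_modelling_empirical.py results/run1_ 10 code_nonlinear_dependency.stan 1.0 compas-scores-two-years.csv
import sys

save_prefix = sys.argv[1]          # path/prefix for figures and log files
group_amount = int(sys.argv[2])    # number of groups for the non-linear model
stan_code_file = sys.argv[3]       # Stan model code file
sigma_tau = float(sys.argv[4])     # prior variance for the Jung-inspired model
data_path = sys.argv[5]            # path to the COMPAS data file
```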
diff --git a/analysis_and_scripts/code_full.stan b/analysis_and_scripts/code_full.stan
deleted file mode 100644
index 5d484a3..0000000
--- a/analysis_and_scripts/code_full.stan
+++ /dev/null
@@ -1,73 +0,0 @@
-data {
-  int<lower=1> D;
-  int<lower=0> N_obs;
-  int<lower=0> N_cens;
-  int<lower=0> M;
-  real<lower=0> sigma_tau;
-  int<lower=1, upper=M> jj_obs[N_obs]; // judge_ID
-  int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID
-  int<lower=0, upper=1> dec_obs[N_obs];
-  int<lower=0, upper=1> dec_cens[N_cens];
-  row_vector[D] X_obs[N_obs];
-  row_vector[D] X_cens[N_cens];
-  int<lower=0, upper=1> y_obs[N_obs];
-}
-
-parameters {
-  vector[N_obs] Z_obs;
-  vector[N_cens] Z_cens;
-  real alpha_T[M];
-  vector[D] beta_XT_raw;
-  vector[D] beta_XY_raw;
-  real<lower=0> beta_ZT_raw; // Jung-style assumption that the latent variable has a positive effect.
-  real<lower=0> beta_ZY_raw; // Jung-style assumption that the latent variable has a positive effect.
-  
-  real<lower=0> tau_XT;
-  real<lower=0> tau_XY;
-  real<lower=0> tau_ZT;
-  real<lower=0> tau_ZY;
-
-}
-
-transformed parameters {
-  vector[D] beta_XT;
-  vector[D] beta_XY;
-  real<lower=0> beta_ZT; // Jung-style assumption that the latent variable has a positive effect.
-  real<lower=0> beta_ZY; // Jung-style assumption that the latent variable has a positive effect.
-
-  beta_XT = tau_XT * beta_XT_raw;
-  beta_XY = tau_XY * beta_XY_raw;
-  beta_ZT = tau_ZT * beta_ZT_raw;
-  beta_ZY = tau_ZY * beta_ZY_raw;
-}
-
-model {
-  Z_obs ~ normal(0, 1);
-  Z_cens ~ normal(0, 1);
-  
-  tau_XT ~ normal(0, sigma_tau);
-  tau_XY ~ normal(0, sigma_tau);
-  tau_ZT ~ normal(0, sigma_tau);
-  tau_ZY ~ normal(0, sigma_tau);
-  
-  beta_XT_raw ~ normal(0, 1);
-  beta_XY_raw ~ normal(0, 1);
-  beta_ZT_raw ~ normal(0, 1);
-  beta_ZY_raw ~ normal(0, 1);
-  
-  for(i in 1:N_obs){
-    dec_obs[i] ~ bernoulli_logit(alpha_T[jj_obs[i]] + X_obs[i] * beta_XT + beta_ZT * Z_obs[i]);
-    y_obs[i] ~ bernoulli_logit(X_obs[i] * beta_XY + beta_ZY * Z_obs[i]);
-  }
-  
-  for(i in 1:N_cens)
-    dec_cens[i] ~ bernoulli_logit(alpha_T[jj_cens[i]] + X_cens[i] * beta_XT + beta_ZT * Z_cens[i]);
-}
-
-generated quantities {
-  int<lower=0, upper=1> y_est[N_cens];
-  
-  for(i in 1:N_cens){
-    y_est[i] = bernoulli_logit_rng(X_cens[i] * beta_XY + beta_ZY * Z_cens[i]);
-  }
-}
diff --git a/analysis_and_scripts/code_linear_dependency.stan b/analysis_and_scripts/code_linear_dependency.stan
new file mode 100644
index 0000000..e177162
--- /dev/null
+++ b/analysis_and_scripts/code_linear_dependency.stan
@@ -0,0 +1,101 @@
+data {
+  int<lower=1> D; // Dimension of the feature and coefficient vectors
+  int<lower=0> N_obs;  // Number of "observed observations" (with T = 1)
+  int<lower=0> N_cens; // Number of "censored observations" (with T = 0)
+  int<lower=0> M; // Number of judges
+  int<lower=1, upper=M> jj_obs[N_obs];   // judge_ID array
+  int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID array
+  int<lower=0, upper=1> dec_obs[N_obs];   // Decisions for the observed observations
+  int<lower=0, upper=1> dec_cens[N_cens]; // Decisions for the censored observations
+  row_vector[D] X_obs[N_obs];   // Features of the observed observations 
+  row_vector[D] X_cens[N_cens]; // Features of the censored observations 
+  int<lower=0, upper=1> y_obs[N_obs]; // Outcomes of the observed observations
+}
+
+parameters {
+  // Latent variable
+  vector[N_obs] Z_obs;
+  vector[N_cens] Z_cens;
+  
+  // Intercepts
+  real alpha_T[M];
+  real alpha_Y;
+
+  // Temporary variables to compute the coefficients
+  vector[D] a_XT;
+  vector[D] a_XY;
+  real<lower=0> a_ZT; // Presume latent variable has a positive coefficient.
+  real<lower=0> a_ZY;
+
+  real<lower=0> tau_XT;
+  real<lower=0> tau_XY;
+  real<lower=0> tau_ZT;
+  real<lower=0> tau_ZY;
+
+}
+
+transformed parameters {
+  
+  // Coefficients
+  vector[D] beta_XT;
+  vector[D] beta_XY;
+  real<lower=0> beta_ZT; // Presume latent variable has a positive coefficient.
+  real<lower=0> beta_ZY;
+
+  beta_XT = a_XT / sqrt(tau_XT);
+  beta_XY = a_XY / sqrt(tau_XY);
+  beta_ZT = a_ZT / sqrt(tau_ZT);
+  beta_ZY = a_ZY / sqrt(tau_ZY);
+}
+
+model {
+  // Latent variable
+  Z_obs ~ normal(0, 1);
+  Z_cens ~ normal(0, 1);
+  
+  // Intercepts
+  alpha_T ~ normal(0, 5);
+  alpha_Y ~ normal(0, 5);
+  
+  // According to
+  // https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations
+  // (section "Prior for the regression coefficients in logistic regression,
+  // non-sparse case") a good prior is a Student's t with 3 < nu < 7 degrees
+  // of freedom, location 0 and scale 1.
+  
+  // Also, according to
+  // https://mc-stan.org/docs/2_19/stan-users-guide/reparameterization-section.html
+  // beta ~ student_t(nu, 0, 1) can be reparametrized by sampling
+  // a ~ N(0, 1) and tau ~ gamma(nu/2, nu/2) and setting beta = a * tau^(-1/2).
+  // The reparametrization is done to achieve better mixing.
+
+  a_XT ~ normal(0, 1);
+  a_XY ~ normal(0, 1);
+  a_ZT ~ normal(0, 1);
+  a_ZY ~ normal(0, 1);
+
+  // nu = 5 -> nu/2 = 2.5
+  tau_XT ~ gamma(2.5, 2.5);
+  tau_XY ~ gamma(2.5, 2.5);
+  tau_ZT ~ gamma(2.5, 2.5);
+  tau_ZY ~ gamma(2.5, 2.5);
+  
+  // Compute the regressions for the observed observations
+  for(i in 1:N_obs){
+    dec_obs[i] ~ bernoulli_logit(alpha_T[jj_obs[i]] + X_obs[i] * beta_XT + beta_ZT * Z_obs[i]);
+    y_obs[i] ~ bernoulli_logit(alpha_Y + X_obs[i] * beta_XY + beta_ZY * Z_obs[i]);
+  }
+
+  // Compute the regression for the censored observations
+  for(i in 1:N_cens)
+    dec_cens[i] ~ bernoulli_logit(alpha_T[jj_cens[i]] + X_cens[i] * beta_XT + beta_ZT * Z_cens[i]);
+}
+
+generated quantities {
+  int<lower=0, upper=1> y_est[N_cens];
+  
+  // Generate a draw from the posterior predictive.
+  for(i in 1:N_cens){
+    y_est[i] = bernoulli_logit_rng(alpha_Y + X_cens[i] * beta_XY + beta_ZY * Z_cens[i]);
+  }
+}
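The prior comments in code_linear_dependency.stan above rely on the Student-t scale-mixture identity: if a ~ N(0, 1) and tau ~ Gamma(nu/2, nu/2), then beta = a / sqrt(tau) follows student_t(nu, 0, 1). A minimal NumPy/SciPy sketch, assuming nu = 5 as in the model above, that checks this empirically:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
nu, n = 5, 200_000

# Scale mixture: a ~ N(0, 1), tau ~ Gamma(nu/2, rate = nu/2), beta = a / sqrt(tau)
a = rng.standard_normal(n)
tau = rng.gamma(shape=nu / 2, scale=2 / nu, size=n)  # NumPy uses scale = 1 / rate
beta = a / np.sqrt(tau)

# Direct Student-t draws for comparison; the two samples should be indistinguishable.
direct = rng.standard_t(df=nu, size=n)
print(stats.ks_2samp(beta, direct))  # expect a large p-value
```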
diff --git a/analysis_and_scripts/code_jung_binarized.stan b/analysis_and_scripts/code_nonlinear_dependency.stan
similarity index 84%
rename from analysis_and_scripts/code_jung_binarized.stan
rename to analysis_and_scripts/code_nonlinear_dependency.stan
index 259293a..1e6a42e 100644
--- a/analysis_and_scripts/code_jung_binarized.stan
+++ b/analysis_and_scripts/code_nonlinear_dependency.stan
@@ -4,7 +4,7 @@ data {
   int<lower=0> N_cens; // Number of "censored observations" (with T = 0)
   int<lower=1> M; // Number of judges
   int<lower=1> K; // Number of groups
-  real<lower=0> sigma_tau;
+  real<lower=0> sigma_tau; // Prior for the variance parameters.
   int<lower=1, upper=M> jj_obs[N_obs];   // judge_ID
   int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID
   int<lower=1, upper=K> kk_obs[N_obs];   // Grouping
@@ -17,15 +17,19 @@ data {
 }
 
 parameters {
+  // Latent variable
   vector[N_obs] Z_obs;
   vector[N_cens] Z_cens;
+  
   real alpha_T[M];  // Judge-specific intercepts
   
+  // Coefficients
   vector[D] beta_XT_raw[K]; 
   vector[D] beta_XY_raw[K];
   
-  vector<lower=0>[K] beta_ZT_raw; // Coefficient for the latent variable.
-  vector<lower=0>[K] beta_ZY_raw; // Coefficient for the latent variable.
+  // Coefficients for the latent variable, presumed to be positive
+  vector<lower=0>[K] beta_ZT_raw;
+  vector<lower=0>[K] beta_ZY_raw;
   
   real<lower=0> tau_XT;
   real<lower=0> tau_XY;
@@ -34,6 +38,11 @@ parameters {
 }
 
 transformed parameters{
+  // Here we transform the _raw parameters into random-walk
+  // (or cumulative random-walk) coefficients. See Jung's arXiv
+  // paper for more information. We extended the model by giving
+  // each judge a separate intercept.
+  
   vector[D] beta_XT[K]; 
   vector[D] beta_XY[K];
 
@@ -47,7 +56,9 @@ transformed parameters{
     beta_XT[1] = beta_XT_raw[1];
     beta_XY[1] = beta_XY_raw[1];
     
-    for(i in 2:K){ // random walk prior here
+    for(i in 2:K){ 
+      // Random-walk prior: each group's coefficient depends
+      // on the previous group's.
       beta_XT[i] = tau_XT * beta_XT_raw[i-1]; // ith group
       beta_XY[i] = tau_XY * beta_XY_raw[i-1];
     }
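The transformed-parameters comments above describe building the K group coefficients as a (cumulative) random walk, following Jung et al. A rough NumPy illustration of that general idea — a sketch of the construction with hypothetical sizes, not a line-for-line port of the Stan code:

```python
import numpy as np

rng = np.random.default_rng(1)
K, D = 5, 3        # number of groups and feature dimension (hypothetical)
tau = 0.5          # step scale of the walk
raw = rng.standard_normal((K, D))   # plays the role of the *_raw parameters

beta = np.empty((K, D))
beta[0] = raw[0]                     # the first group anchors the walk
for k in range(1, K):
    # each group's coefficient drifts from the previous group's
    beta[k] = beta[k - 1] + tau * raw[k]

print(beta)
```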
diff --git a/analysis_and_scripts/parallel_model_job.job b/analysis_and_scripts/parallel_model_job.job
new file mode 100644
index 0000000..49cba7d
--- /dev/null
+++ b/analysis_and_scripts/parallel_model_job.job
@@ -0,0 +1,12 @@
+#!/bin/bash
+#SBATCH --job-name=sl_stan_model_parallel
+#SBATCH --workdir=/wrk/users/rikulain/
+#SBATCH -c 5
+#SBATCH -t 30:00:00
+#SBATCH --mem=45G
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=riku.laine@helsinki.fi
+
+srun hostname
+
+time python /proj/rikulain/stan_modelling_theoretic.py "/wrk/users/rikulain/figures/results_${1}_" 25000 50 $1 1 /proj/rikulain/code_full.stan 1
diff --git a/analysis_and_scripts/stan_modelling_empirical.py b/analysis_and_scripts/stan_modelling_empirical.py
index 6ff23e3..9468f14 100644
--- a/analysis_and_scripts/stan_modelling_empirical.py
+++ b/analysis_and_scripts/stan_modelling_empirical.py
@@ -2,66 +2,59 @@
 # -*- coding: utf-8 -*-
 """
 # Author: Riku Laine
-# Date: 26JUL2019
+# Date: 12AUG2019
 # Project name: Potential outcomes in model evaluation
 # Description: This script creates the figures and results used 
-#              in empirical data experiments.
+#              in the empirical experiments with the COMPAS data.
 #
 # Parameters:
 # -----------
-# (1) figure_path : file name for saving the created figures.
-# (2) nIter : Number of train test splits to perform on the data.
-# (3) group_amount : How many groups if Jung-inspired model is used.
-# (4) stan_code_file_name : Name of file containing the stan model code.
-# (5) sigma_tau : Values of prior variance for the Jung-inspired model.
-# (6) data_path : File of compas data.
-
+# (1) save_name : file path and prefix for saving the created figures.
+# (2) group_amount : How many groups if the non-linear model is used.
+# (3) stan_code_file_name : Name of file containing the stan model code.
+# (4) sigma_tau : Value of the prior variance for the Jung-inspired model.
+# (5) data_path : Path to data file.
 """
 
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
-import scipy.stats as scs
 import scipy.special as ssp
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
 import pystan
+import gc
 
 plt.switch_backend('agg')
 
+plt.rcParams.update({'font.size': 16})
+plt.rcParams.update({'figure.figsize': (10, 6)})
+
 import sys
 
 # figure storage name
-figure_path = sys.argv[1]
-
-# Iterations
-nIter = int(sys.argv[2])
+save_name = sys.argv[1] + "sl_compas"
 
 # How many groups if jung model is used
-group_amount = int(sys.argv[3])
+group_amount = int(sys.argv[2])
 
 # Name of stan model code file
-stan_code_file_name = sys.argv[4]
+stan_code_file_name = sys.argv[3]
 
 # Variance prior
-sigma_tau = float(sys.argv[5])
+sigma_tau = float(sys.argv[4])
 
 # figure storage name
-data_path = sys.argv[6]
+data_path = sys.argv[5]
 
 # Prefix for the figures and log files
 
 print("These results have been obtained with the following settings:")
 
-print("Number of train test splits :", nIter)
-
 print("Number of groups:", group_amount)
 
 print("Prior for the variances:", sigma_tau)
 
-save_name = "sl_compas"
-
 def inv_logit(x):
     return 1.0 / (1.0 + np.exp(-1.0 * x))
 
@@ -76,6 +69,11 @@ def inverse_cumulative(x, mu, sigma):
 
     return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
 
+def standardError(p, n):
+    denominator = p * (1 - p)
+    
+    return np.sqrt(denominator / n)
+
 ##########################
 
 # ## Evaluator modules
@@ -207,6 +205,26 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
     # Subset. "D_q is the set of all observations judged by q."
     D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
 
+    ##### Agreement rate computation begins #######
+    
+    max_leniency = max(df[accRateR_col])
+    
+    # Sort D_q
+    # "Observations deemed as high risk by B are at the top of this list"
+    D_sort_q = D_q.sort_values(by=modelProbS_col, ascending=False)
+    
+    # The agreement rate is now computed from the positive decisions given
+    # by q among the (1 - r) * 100 % of subjects deemed most dangerous.
+    all_negatives = np.sum(D_sort_q[decisionT_col] == 0)
+    
+    n_most_dangerous = int(round(D_q.shape[0] * (1-max_leniency)))
+    
+    agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous])
+
+    print("The agreement rate of contraction is:", agreed_decisions/all_negatives)
+    
+    ##### Agreement rate computation ends #######
+    
     # All observations of R_q have observed outcome labels.
     # "R_q is the set of observations in D_q with observed outcome labels."
     R_q = D_q[D_q[decisionT_col] == 1].copy()
@@ -222,109 +240,42 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
     # "R_B is the list of observations assigned to t = 1 by B"
     R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
 
-    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0]
+    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
 
 
 # ### Evaluators
 
-def contractionEvaluator(df, featureX_col, judgeIDJ_col, decisionT_col,
-                         resultY_col, accRateR_col, r):
-
-    train, test = train_test_split(df, test_size=0.5)
-
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train[decisionT_col] == 1, featureX_col],
-        train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col],
-        0)
-
-    test = test.assign(B_prob_0_model=predictions)
-
-    # Invoke the original contraction.
-    FR = contraction(test,
-                     judgeIDJ_col=judgeIDJ_col,
-                     decisionT_col=decisionT_col,
-                     resultY_col=resultY_col,
-                     modelProbS_col="B_prob_0_model",
-                     accRateR_col=accRateR_col,
-                     r=r)
-
-    return FR
-
+def trueEvaluationEvaluator(df, resultY_col, r):
 
-def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r,
-                            fit_model=True):
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
 
-    train, test = train_test_split(df, test_size=0.5)
-
-    if fit_model:
-        B_model, predictions = fitPredictiveModel(train[featureX_col],
-                                                  train[resultY_col],
-                                                  test[featureX_col], 0)
+    to_release = int(round(df.shape[0] * r))
     
-        test = test.assign(B_prob_0_model=predictions)
-
-    test.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(test.shape[0] * r))
-
-    return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0]
+    failed = df[resultY_col][0:to_release] == 0
 
+    return np.sum(failed) / df.shape[0], df.shape[0]
 
-def labeledOutcomesEvaluator(df,
-                             featureX_col,
-                             decisionT_col,
-                             resultY_col,
-                             r,
-                             adjusted=False,
-                             fit_model=True):
 
-    train, test = train_test_split(df, test_size=0.5)
+def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
 
-    if fit_model:
-        B_model, predictions = fitPredictiveModel(
-            train.loc[train[decisionT_col] == 1, featureX_col],
-            train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col],
-            0)
+    df_observed = df.loc[df[decisionT_col] == 1, :]
 
-        test = test.assign(B_prob_0_model=predictions)
-
-    test_observed = test.loc[test[decisionT_col] == 1, :]
-
-    test_observed = test_observed.sort_values(by='B_prob_0_model',
+    df_observed = df_observed.sort_values(by='B_prob_0_model',
                                               inplace=False,
                                               ascending=True)
 
-    to_release = int(round(test_observed.shape[0] * r))
+    to_release = int(round(df_observed.shape[0] * r))
+    
+    failed = df_observed[resultY_col][0:to_release] == 0
 
     if adjusted:
-        return np.mean(test_observed[resultY_col][0:to_release] == 0)
+        return np.mean(failed), df.shape[0]
 
-    return np.sum(
-        test_observed[resultY_col][0:to_release] == 0) / test.shape[0]
+    return np.sum(failed) / df.shape[0], df.shape[0]
 
 
-def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
-                             accRateR_col, r):
+################### Data preprocessing ######################
 
-    # Get judges with correct leniency as list
-    is_correct_leniency = df[accRateR_col].round(1) == r
-
-    # No judges with correct leniency
-    if np.sum(is_correct_leniency) == 0:
-        return np.nan
-
-    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
-
-    # Released are the people they judged and released, T = 1
-    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
-                  & (df[decisionT_col] == 1)]
-
-    # Get their failure rate, aka ratio of reoffenders to number of people judged in total
-    return np.sum(released[resultY_col] == 0) / correct_leniency_list.shape[0]
-
-
-###################
-    
 # Read in the data
 compas_raw = pd.read_csv(data_path)
 
@@ -348,7 +299,7 @@ compas = compas[compas.score_text.notnull()]
 # So here result_Y = 1 - is_recid (inverted binary coding).
 compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1)
 
-# Convert string values to dummies, drop first so full rank
+# Convert string values to dummies, drop first to keep full rank.
 compas_dummy = pd.get_dummies(
     compas,
     columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
@@ -363,13 +314,13 @@ compas_dummy.drop(columns=[
 # Shuffle rows for random judge assignment
 compas_shuffled = compas_dummy.sample(frac=1)
 
-nJudges_M = 9
+nJudges_M = 10
 
 # Assign judges as evenly as possible
 judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False)
 
 # Assign fixed leniencies from 0.1 to 0.9
-judge_leniency = np.arange(1, 10) / 10
+judge_leniency = np.array([0.1, 0.3, 0.5, 0.7, 0.9, 0.1, 0.3, 0.5, 0.7, 0.9])
 
 judge_leniency = judge_leniency[judge_ID]
 
@@ -412,162 +363,187 @@ feature_cols = compas_labeled.columns[feature_cols]
 
 ####### Draw figures ###########
 
-failure_rates = np.zeros((8, 6))
-failure_sems = np.zeros((8, 6))
-
-f_rate_true = np.zeros((nIter, 8))
-f_rate_label = np.zeros((nIter, 8))
-f_rate_label_adj = np.zeros((nIter, 8))
-f_rate_human = np.zeros((nIter, 8))
-f_rate_cont = np.zeros((nIter, 8))
-f_rate_caus = np.zeros((nIter, 8))
-
-
-# Split data
-train, test = train_test_split(compas_labeled, test_size=0.5)
-
-# Train a logistic regression model
-B_model, predictions = fitPredictiveModel(
-    train.loc[train['decision_T'] == 1, feature_cols],
-    train.loc[train['decision_T'] == 1, 'result_Y'], test[feature_cols], 0)
-
-test = test.assign(B_prob_0_model=predictions)
-
-test.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+def drawDiagnostics(title, save_name, f_rates, titles):
+    
+    cols = 2
+    rows = np.ceil(len(f_rates) / cols)
+    
+    plt.figure(figsize=(16, 4.5*rows+1))
 
-kk_array = pd.qcut(test['B_prob_0_model'], group_amount, labels=False)
+    ax = plt.subplot(rows, cols, 1)
+    x_ax = np.arange(1, 9, 1) / 10
 
-# Find observed values
-observed = test['decision_T'] == 1
+    plt.boxplot(f_rates[0], labels=x_ax)
 
-# Assign data to the model
-dat = dict(D=1,
-           N_obs=np.sum(observed),
-           N_cens=np.sum(~observed),
-           K=group_amount,
-           sigma_tau=sigma_tau,
-           M=len(set(compas_labeled['judge_ID'])),
-           jj_obs=test.loc[observed, 'judge_ID']+1,
-           jj_cens=test.loc[~observed, 'judge_ID']+1,
-           kk_obs=kk_array[observed]+1,
-           kk_cens=kk_array[~observed]+1,
-           dec_obs=test.loc[observed, 'decision_T'],
-           dec_cens=test.loc[~observed, 'decision_T'],
-           X_obs=test.loc[observed, 'B_prob_0_model'].values.reshape(-1,1),
-           X_cens=test.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1),
-           y_obs=test.loc[observed, 'result_Y'].astype(int))
+    plt.title(titles[0])
+    plt.xlabel('Acceptance rate')
+    plt.ylabel('Failure rate')
+    plt.grid()
 
-sm = pystan.StanModel(file=stan_code_file_name)
+    for i in range(len(f_rates)):
+        plt.subplot(rows, cols, i + 1, sharey=ax)
 
-fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9))
+        plt.boxplot(f_rates[i], labels=x_ax)
 
-pars = fit.extract()
+        plt.title(titles[i])
+        plt.xlabel('Acceptance rate')
+        plt.ylabel('Failure rate')
+        plt.grid()
 
-print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
+    plt.tight_layout()
+    plt.subplots_adjust(top=0.85)
+    plt.suptitle(title, y=0.96, weight='bold')
 
-plt.figure(figsize=(15,30))
+    plt.savefig(save_name + '_diagnostic_plot')
 
-fit.plot();
+    plt.show()
 
-plt.savefig(save_name + '_stan_diagnostic_plot')
+nIter = 1
 
-plt.show()
-plt.close('all')
+failure_rates = np.zeros((8, 4))
+failure_stds = np.zeros((8, 4))
 
-# Bayes
+f_rate_true = np.zeros((nIter, 8))
+f_rate_label = np.zeros((nIter, 8))
+f_rate_cont = np.zeros((nIter, 8))
+f_rate_cf = np.zeros((nIter, 8))
 
-# Initialize the matrix: one row per posterior sample,
-# one column per observation
-y_imp = np.ones((pars['y_est'].shape[0], test.shape[0]))
 
-# Fill in the unobserved outcomes with the estimates
-y_imp[:, ~observed] = 1-pars['y_est']
+for i in range(nIter):
 
-# Fill in the observed outcomes with the observed values
-y_imp[:, observed] = 1-test.loc[observed, 'result_Y']
+    # Split data by the judges
+    train = compas_labeled[compas_labeled.judge_ID <=4]
+    test_labeled = compas_labeled[compas_labeled.judge_ID >4]
+    
+    # Assign same observations to unlabeled data
+    test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values]
+    
+    
+    # Train a logistic regression model
+    B_model, predictions = fitPredictiveModel(
+        train.loc[train['decision_T'] == 1, feature_cols],
+        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0)
+    
+    # Attach predictions to data
+    test_labeled = test_labeled.assign(B_prob_0_model=predictions)
+    test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
+    
+    test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
+    
+    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
+    
+    # Find observed values
+    observed = test_labeled['decision_T'] == 1
+    
+    # Assign data to the model
+    dat = dict(D=10,
+               N_obs=np.sum(observed),
+               N_cens=np.sum(~observed),
+               K=group_amount,
+               sigma_tau=sigma_tau,
+               M=len(set(compas_labeled['judge_ID'])),
+               jj_obs=test_labeled.loc[observed, 'judge_ID']+1,
+               jj_cens=test_labeled.loc[~observed, 'judge_ID']+1,
+               kk_obs=kk_array[observed]+1,
+               kk_cens=kk_array[~observed]+1,
+               dec_obs=test_labeled.loc[observed, 'decision_T'],
+               dec_cens=test_labeled.loc[~observed, 'decision_T'],
+               X_obs=test_labeled.loc[observed, feature_cols.values],
+               X_cens=test_labeled.loc[~observed, feature_cols.values],
+               y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
+    
+    sm = pystan.StanModel(file=stan_code_file_name)
+    
+    # Do the sampling
+    fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.95))
+        
+    pars = fit.extract()
+    
+    plt.figure(figsize=(15,30))
 
-Rs = np.arange(.1, .9, .1)
+    fit.plot();
 
-to_release_list = np.round(test.shape[0] * Rs).astype(int)
+    plt.savefig(save_name + '_stan_convergence_diagnostic_plot_' + str(i))
+    
+    plt.show()
+    plt.close('all')
+    gc.collect()
 
-f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan)
+    print(fit, file=open(save_name + '_stan_fit_diagnostics_' + str(i) + '.txt', 'w'))
 
-for i in range(len(to_release_list)):
-    est_failure_rates = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) / test.shape[0]
+    # Bayes
+    
+    # Initialize the matrix: one row per posterior sample,
+    # one column per observation
+        
+    y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
+    
+    # Impute the unobserved outcomes with the posterior estimates.
+    # Reverse the binary coding so that a failure (Y = 0) counts as 1 when summing.
+    y_imp[:, ~observed] = 1 - pars['y_est']
+    
+    # Remove the variables to conserve memory
+    del(pars)
+    del(fit)
     
-    f_rate_bayes[:, i] = est_failure_rates
     
-    failure_rates[i, 5] = np.mean(est_failure_rates)    
+    # Impute the observed
+    y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
+   
+    for r in range(1, 9):
         
-for i in range(nIter):
+        to_release = np.round(test_labeled.shape[0] * (r / 10)).astype(int)
+                        
+        failed = np.sum(y_imp[:, 0:to_release], axis=1)
+            
+        f_rate_cf[i, r - 1] = np.mean(failed / test_labeled.shape[0])
     
-    print(" [", i, "] ", sep='', end="")
-
-    for r in np.arange(1, 9):
-
         print(".", end="")
-
+    
         # True evaluation
-
-        f_rate_true[i, r - 1] = trueEvaluationEvaluator(
-            compas_unlabeled, feature_cols, 'decision_T', 'result_Y', r / 10)
-
+    
+        f_rate_true[i, r - 1], N = trueEvaluationEvaluator(test_unlabeled, 
+                   'result_Y', r / 10)
+            
         # Labeled outcomes only
-
-        f_rate_label[i, r - 1] = labeledOutcomesEvaluator(
-            compas_labeled, feature_cols, 'decision_T', 'result_Y', r / 10)
-
-        # Adjusted labeled outcomes
-
-        f_rate_label_adj[i, r - 1] = labeledOutcomesEvaluator(
-            compas_labeled,
-            feature_cols,
-            'decision_T',
-            'result_Y',
-            r / 10,
-            adjusted=True)
-
-        # Human evaluation
-
-        f_rate_human[i, r - 1] = humanEvaluationEvaluator(
-            compas_labeled, 'judge_ID', 'decision_T', 'result_Y',
-            'judge_leniency', r / 10)
-
+    
+        f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled, 
+                    'decision_T', 'result_Y', r / 10)
+    
         # Contraction
-
-        f_rate_cont[i, r - 1] = contractionEvaluator(
-            compas_labeled, feature_cols, 'judge_ID', 'decision_T', 'result_Y',
-            'judge_leniency', r / 10)
-
+    
+        f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judge_ID', 
+                   'decision_T', 'result_Y', 'B_prob_0_model', 
+                   'judge_leniency', r / 10)
+    
 failure_rates[:, 0] = np.mean(f_rate_true, axis=0)
 failure_rates[:, 1] = np.mean(f_rate_label, axis=0)
-failure_rates[:, 2] = np.mean(f_rate_label_adj, axis=0)
-failure_rates[:, 3] = np.mean(f_rate_human, axis=0)
-failure_rates[:, 4] = np.mean(f_rate_cont, axis=0)
+failure_rates[:, 2] = np.mean(f_rate_cont, axis=0)
+failure_rates[:, 3] = np.mean(f_rate_cf, axis=0)
 
-failure_sems[:, 0] = scs.sem(f_rate_true, axis=0)
-failure_sems[:, 1] = scs.sem(f_rate_label, axis=0)
-failure_sems[:, 2] = scs.sem(f_rate_label_adj, axis=0)
-failure_sems[:, 3] = scs.sem(f_rate_human, axis=0)
-failure_sems[:, 4] = scs.sem(f_rate_cont, axis=0)
-failure_sems[:, 5] = scs.sem(f_rate_bayes, axis=0, nan_policy='omit')
+failure_stds[:, 0] = np.std(f_rate_true, axis=0)
+failure_stds[:, 1] = np.std(f_rate_label, axis=0)
+failure_stds[:, 2] = np.std(f_rate_cont, axis=0)
+failure_stds[:, 3] = np.std(f_rate_cf, axis=0)
 
 x_ax = np.arange(0.1, 0.9, 0.1)
 
-labels = [
-    'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
-    'Human evaluation', 'Contraction', 'Potential outcomes'
-]
-colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c']
+labels = ['True Evaluation', 'Labeled outcomes', 'Contraction', 
+          'Counterfactuals']
+
+colours = ['g', 'magenta', 'b', 'r']
+
+line_styles = ['--', ':', '-.', '-']
 
 for i in range(failure_rates.shape[1]):
     plt.errorbar(x_ax,
                  failure_rates[:, i],
                  label=labels[i],
                  c=colours[i],
-                 yerr=failure_sems[:, i])
+                 linestyle=line_styles[i],
+                 yerr=failure_stds[:, i])
 
-plt.title('Failure rate vs. Acceptance rate')
+#plt.title('FR vs. AR')
 plt.xlabel('Acceptance rate')
 plt.ylabel('Failure rate')
 plt.legend()
@@ -580,50 +556,18 @@ plt.show()
 print("\nFailure rates:")
 print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
 
+print("\nStandard deviations:")
+print(np.array2string(failure_stds, formatter={'float_kind':lambda x: "%.5f" % x}))
+
+
 print("\nMean absolute errors:")
 for i in range(1, failure_rates.shape[1]):
     print(
         labels[i].ljust(len(max(labels, key=len))),
         np.round(
             np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
- 
-    
-# Draw diagnostic figures
-f_rates= [f_rate_true, f_rate_label, f_rate_label_adj, f_rate_human, 
-          f_rate_cont, f_rate_bayes]
-
-cols = 2
-rows = np.ceil(len(f_rates) / cols)
-
-plt.figure(figsize=(16, 4.5*rows+1))
-
-ax = plt.subplot(rows, cols, 1)
-x_ax = np.arange(1, 9, 1) / 10
-
-plt.boxplot(f_rates[0], labels=x_ax)
-
-plt.title(labels[0])
-plt.xlabel('Acceptance rate')
-plt.ylabel('Failure rate')
-plt.grid()
-
-for i in range(len(f_rates)):
-    plt.subplot(rows, cols, i + 1, sharey=ax)
-
-    plt.boxplot(f_rates[i], labels=x_ax)
-
-    plt.title(labels[i])
-    plt.xlabel('Acceptance rate')
-    plt.ylabel('Failure rate')
-    plt.grid()
-
-plt.tight_layout()
-plt.subplots_adjust(top=0.89)
-
-title = "COMPAS data set\nFluctuation of failure rate estimates per method"
-
-plt.suptitle(title, y=0.96, weight='bold')
-
-plt.savefig(save_name + '_diagnostic_plot')
 
-plt.show()
\ No newline at end of file
+drawDiagnostics(title="Variation of failure rate estimates by method\nCOMPAS data set",
+                save_name=save_name,
+                f_rates=[f_rate_true, f_rate_label, f_rate_cont, f_rate_cf],
+                titles=labels)
\ No newline at end of file
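The "Bayes" section of stan_modelling_empirical.py above imputes the unobserved outcomes from the posterior draws (y_est) and computes a failure rate per posterior sample. A small self-contained NumPy sketch of that imputation logic, using made-up sizes and data and assuming, as in the script, that outcomes are coded 1 = success, 0 = failure and that observations are already sorted by predicted risk:

```python
import numpy as np

rng = np.random.default_rng(2)
n_samples, n_obs = 1000, 8              # posterior draws x observations (toy sizes)
observed = rng.random(n_obs) < 0.6      # which subjects have an observed outcome
result_Y = rng.integers(0, 2, n_obs)    # observed outcomes, 1 = success
y_est = rng.integers(0, 2, (n_samples, int((~observed).sum())))  # posterior predictive draws

# One row per posterior sample, one column per observation; a 1 marks a failure.
y_imp = np.ones((n_samples, n_obs))
y_imp[:, ~observed] = 1 - y_est              # impute unobserved outcomes from the posterior
y_imp[:, observed] = 1 - result_Y[observed]  # observed outcomes are used as-is

# Failure rate when the first r * 100 % of (pre-sorted) subjects are released.
r = 0.5
to_release = int(round(n_obs * r))
failed = y_imp[:, :to_release].sum(axis=1)
print(np.mean(failed / n_obs))  # posterior mean failure rate at acceptance rate r
```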
diff --git a/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py b/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py
deleted file mode 100644
index 18eb131..0000000
--- a/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py
+++ /dev/null
@@ -1,532 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-# Author: Riku Laine
-# Date: 26JUL2019
-# Project name: Potential outcomes in model evaluation
-# Description: This script creates the figures and results used 
-#              in empirical data experiments.
-#
-# Parameters:
-# -----------
-# (1) figure_path : file name for saving the created figures.
-# (2) group_amount : How many groups if Jung-inspired model is used.
-# (3) stan_code_file_name : Name of file containing the stan model code.
-# (4) sigma_tau : Values of prior variance for the Jung-inspired model.
-# (5) data_path : File of compas data.
-
-"""
-
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import scipy.special as ssp
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
-import pystan
-
-plt.switch_backend('agg')
-
-import sys
-
-# figure storage name
-figure_path = sys.argv[1]
-
-# How many groups if jung model is used
-group_amount = int(sys.argv[2])
-
-# Name of stan model code file
-stan_code_file_name = sys.argv[3]
-
-# Variance prior
-sigma_tau = float(sys.argv[4])
-
-# figure storage name
-data_path = sys.argv[5]
-
-# Prefix for the figures and log files
-
-print("These results have been obtained with the following settings:")
-
-print("Number of groups:", group_amount)
-
-print("Prior for the variances:", sigma_tau)
-
-save_name = "sl_compas"
-
-def inv_logit(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
-
-
-def logit(x):
-    return np.log(x) - np.log(1.0 - x)
-
-
-def inverse_cumulative(x, mu, sigma):
-    '''Compute the inverse of the cumulative distribution of logit-normal
-    distribution at x with parameters mu and sigma (mean and st.dev.).'''
-
-    return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
-
-def standardError(p, n):
-    denominator = p * (1 - p)
-    
-    return np.sqrt(denominator / n)
-
-##########################
-
-# ## Evaluator modules
-
-# ### Convenience functions
-
-def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
-    '''
-    Fit a predictive model (default logistic regression) with given training 
-    instances and return probabilities for test instances to obtain a given 
-    class label.
-    
-    Arguments:
-    ----------
-    
-    x_train -- x values of training instances
-    y_train -- y values of training instances
-    x_test -- x values of test instances
-    class_value -- class label for which the probabilities are counted for.
-    model_type -- type of model to be fitted.
-    
-    Returns:
-    --------
-    (1) Trained predictive model
-    (2) Probabilities for given test inputs for given class.
-    '''
-
-    if model_type is None or model_type in ["logistic_regression", "lr"]:
-        # Instantiate the model (using the default parameters)
-        logreg = LogisticRegression(solver='lbfgs')
-
-        # Check shape and fit the model.
-        if x_train.ndim == 1:
-            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
-        else:
-            logreg = logreg.fit(x_train, y_train)
-
-        label_probs_logreg = getProbabilityForClass(x_test, logreg,
-                                                    class_value)
-
-        return logreg, label_probs_logreg
-
-    elif model_type in ["random_forest", "rf"]:
-        # Instantiate the model
-        forest = RandomForestClassifier(n_estimators=100, max_depth=3)
-
-        # Check shape and fit the model.
-        if x_train.ndim == 1:
-            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
-        else:
-            forest = forest.fit(x_train, y_train)
-
-        label_probs_forest = getProbabilityForClass(x_test, forest,
-                                                    class_value)
-
-        return forest, label_probs_forest
-
-    elif model_type == "fully_random":
-
-        label_probs = np.ones_like(x_test) / 2
-
-        model_object = lambda x: 0.5
-
-        return model_object, label_probs
-    else:
-        raise ValueError("Invalid model_type!", model_type)
-
-
-def getProbabilityForClass(x, model, class_value):
-    '''
-    Function (wrapper) for obtaining the probability of a class given x and a 
-    predictive model.
-
-    Arguments:
-    -----------
-    x -- individual features, an array of shape (observations, features)
-    model -- a trained sklearn model. Predicts probabilities for given x. 
-        Should accept input of shape (observations, features)
-    class_value -- the resulting class to predict (usually 0 or 1).
-
-    Returns:
-    --------
-    (1) The probabilities of given class label for each x.
-    '''
-    if x.ndim == 1:
-        # if x is vector, transform to column matrix.
-        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
-    else:
-        f_values = model.predict_proba(x)
-
-    # Get correct column of predicted class, remove extra dimensions and return.
-    return f_values[:, model.classes_ == class_value].flatten()
-
-
-# ### Contraction algorithm
-# 
-# Below is an implementation of Lakkaraju's team's algorithm presented in 
-# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant
-# parameters to be passed to the function are presented in the description.
-
-def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
-                accRateR_col, r):
-    '''
-    This is an implementation of the algorithm presented by Lakkaraju
-    et al. in their paper "The Selective Labels Problem: Evaluating 
-    Algorithmic Predictions in the Presence of Unobservables" (2017).
-
-    Arguments:
-    ----------
-    df -- The (Pandas) data frame containing the data, judge decisions,
-        judge IDs, results and probability scores.
-    judgeIDJ_col -- String, the name of the column containing the judges' IDs
-        in df.
-    decisionT_col -- String, the name of the column containing the judges' decisions
-    resultY_col -- String, the name of the column containing the realization
-    modelProbS_col -- String, the name of the column containing the probability
-        scores from the black-box model B.
-    accRateR_col -- String, the name of the column containing the judges' 
-        acceptance rates
-    r -- Float between 0 and 1, the given acceptance rate.
-
-    Returns:
-    --------
-    (1) The estimated failure rate at acceptance rate r.
-    '''
-    # Get ID of the most lenient judge.
-    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
-
-    # Subset. "D_q is the set of all observations judged by q."
-    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
-
-    # All observations of R_q have observed outcome labels.
-    # "R_q is the set of observations in D_q with observed outcome labels."
-    R_q = D_q[D_q[decisionT_col] == 1].copy()
-
-    # Sort observations in R_q in descending order of confidence scores S and
-    # assign to R_sort_q.
-    # "Observations deemed as high risk by B are at the top of this list"
-    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
-
-    number_to_remove = int(
-        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
-
-    # "R_B is the list of observations assigned to t = 1 by B"
-    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
-
-    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
-
-
-# ### Evaluators
-
-def trueEvaluationEvaluator(df, resultY_col, r):
-
-    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(df.shape[0] * r))
-    
-    failed = df[resultY_col][0:to_release] == 0
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
-
-    df_observed = df.loc[df[decisionT_col] == 1, :]
-
-    df_observed = df_observed.sort_values(by='B_prob_0_model',
-                                              inplace=False,
-                                              ascending=True)
-
-    to_release = int(round(df_observed.shape[0] * r))
-    
-    failed = df_observed[resultY_col][0:to_release] == 0
-
-    if adjusted:
-        return np.mean(failed), df.shape[0]
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
-                             accRateR_col, r):
-
-    # Get judges with correct leniency as list
-    is_correct_leniency = df[accRateR_col].round(1) == r
-
-    # No judges with correct leniency
-    if np.sum(is_correct_leniency) == 0:
-        return np.nan, np.nan
-
-    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
-
-    # Released are the people they judged and released, T = 1
-    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
-                  & (df[decisionT_col] == 1)]
-
-    failed = released[resultY_col] == 0
-    
-    # Get their failure rate, aka ratio of reoffenders to number of people judged in total
-    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
-
-###################
-
-# Read in the data
-compas_raw = pd.read_csv(data_path)
-
-# Select columns
-compas = compas_raw[[
-    'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
-    'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
-    'two_year_recid', 'c_jail_in', 'c_jail_out'
-]]
-
-# Subset values, see reasons in ProPublica methodology.
-compas = compas.query('days_b_screening_arrest <= 30 and \
-                      days_b_screening_arrest >= -30 and \
-                      is_recid != -1 and \
-                      c_charge_degree != "O"')
-
-# Drop row if score_text is na
-compas = compas[compas.score_text.notnull()]
-
-# Recode recidivism values to fit earlier notation
-# So here result_Y = 1 - is_recid (inverted binary coding).
-compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1)
-
-# Convert string values to dummies, drop first so full rank
-compas_dummy = pd.get_dummies(
-    compas,
-    columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
-    drop_first=True)
-
-compas_dummy.drop(columns=[
-    'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
-    'two_year_recid', 'score_text', 'is_recid'
-],
-    inplace=True)
-
-# Shuffle rows for random judge assignment
-compas_shuffled = compas_dummy.sample(frac=1)
-
-nJudges_M = 9
-
-# Assign judges as evenly as possible
-judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False)
-
-# Assign fixed leniencies from 0.1 to 0.9
-judge_leniency = np.arange(1, 10) / 10
-
-judge_leniency = judge_leniency[judge_ID]
-
-compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID,
-                                         judge_leniency=judge_leniency)
-
-# Sort by judges then probabilities in decreasing order
-# Most dangerous for each judge are at the top.
-compas_shuffled.sort_values(by=["judge_ID", "decile_score"],
-                            ascending=False,
-                            inplace=True)
-
-# Iterate over the data. Subject will be given a negative decision
-# if they are in the top (1-r)*100% of the individuals the judge will judge.
-# I.e. if their within-judge-index is under 1 - acceptance threshold times
-# the number of subjects assigned to each judge they will receive a
-# negative decision.
-compas_shuffled.reset_index(drop=True, inplace=True)
-
-subjects_allocated = compas_shuffled.judge_ID.value_counts()
-
-compas_shuffled['judge_index'] = compas_shuffled.groupby('judge_ID').cumcount()
-
-compas_shuffled['decision_T'] = np.where(
-    compas_shuffled['judge_index'] < (1 - compas_shuffled['judge_leniency']) *
-    subjects_allocated[compas_shuffled['judge_ID']].values, 0, 1)
-
-compas_labeled = compas_shuffled.copy()
-compas_unlabeled = compas_shuffled.copy()
-
-# Hide unobserved
-compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan
-
-# Choose feature_columns
-feature_cols = ~compas_labeled.columns.isin(
-    ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T'])
-
-feature_cols = compas_labeled.columns[feature_cols]
-
-
-####### Draw figures ###########
-
-failure_rates = np.zeros((8, 6))
-error_Ns = np.zeros((8, 6))
-
-# Split data
-train, test_labeled = train_test_split(compas_labeled, test_size=0.5)
-
-# Assign same observations to unlabeled data
-test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values]
-
-
-# Train a logistic regression model
-B_model, predictions = fitPredictiveModel(
-    train.loc[train['decision_T'] == 1, feature_cols],
-    train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0)
-
-test_labeled = test_labeled.assign(B_prob_0_model=predictions)
-test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
-
-test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
-
-# Find observed values
-observed = test_labeled['decision_T'] == 1
-
-# Assign data to the model
-dat = dict(D=10,
-           N_obs=np.sum(observed),
-           N_cens=np.sum(~observed),
-           K=group_amount,
-           sigma_tau=sigma_tau,
-           M=len(set(compas_labeled['judge_ID'])),
-           jj_obs=test_labeled.loc[observed, 'judge_ID']+1,
-           jj_cens=test_labeled.loc[~observed, 'judge_ID']+1,
-           kk_obs=kk_array[observed]+1,
-           kk_cens=kk_array[~observed]+1,
-           dec_obs=test_labeled.loc[observed, 'decision_T'],
-           dec_cens=test_labeled.loc[~observed, 'decision_T'],
-           X_obs=test_labeled.loc[observed, feature_cols.values],
-           X_cens=test_labeled.loc[~observed, feature_cols.values],
-           y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
-
-sm = pystan.StanModel(file=stan_code_file_name)
-
-fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9))
-
-pars = fit.extract()
-
-print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
-
-plt.figure(figsize=(15,30))
-
-fit.plot();
-
-plt.savefig(save_name + '_stan_diagnostic_plot')
-
-plt.show()
-plt.close('all')
-
-# Bayes
-
-# Initialize the matrix: one row per posterior sample,
-# one column per observation
-y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
-
-# Fill in the unobserved outcomes with the estimates
-y_imp[:, ~observed] = 1-pars['y_est']
-
-# Fill in the observed outcomes with the observed values
-y_imp[:, observed] = 1-test_labeled.loc[observed, 'result_Y']
-
-Rs = np.arange(.1, .9, .1)
-
-to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
-
-f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan)
-
-for i in range(len(to_release_list)):
-    
-    failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
-    
-    est_failure_rates =  failed / test_labeled.shape[0]
-            
-    failure_rates[i, 5] = np.mean(est_failure_rates)
-    
-    error_Ns[i, 5] = test_labeled.shape[0]
-
-for r in np.arange(1, 9):
-
-    print(".", end="")
-
-    # True evaluation
-
-    FR, N = trueEvaluationEvaluator(test_unlabeled, 'result_Y', r / 10)
-    
-    failure_rates[r - 1, 0] = FR
-    error_Ns[r - 1, 0] = N
-
-    # Labeled outcomes only
-
-    FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10)
-
-    failure_rates[r - 1, 1] = FR
-    error_Ns[r - 1, 1] = N
-    
-    # Adjusted labeled outcomes
-
-    FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10,
-                                     adjusted=True)
-
-    failure_rates[r - 1, 2] = FR
-    error_Ns[r - 1, 2] = N
-
-    # Human evaluation
-
-    FR, N = humanEvaluationEvaluator(test_labeled, 'judge_ID', 'decision_T', 
-                                     'result_Y',  'judge_leniency', 
-                                     r / 10)
-
-    failure_rates[r - 1, 3] = FR
-    error_Ns[r - 1, 3] = N
-
-    # Contraction
-
-    FR, N = contraction(test_labeled, 'judge_ID', 'decision_T', 'result_Y', 
-                        'B_prob_0_model', 'judge_leniency', r / 10)
-
-    failure_rates[r - 1, 4] = FR
-    error_Ns[r - 1, 4] = N
-
-x_ax = np.arange(0.1, 0.9, 0.1)
-
-failure_SEs = standardError(failure_rates, error_Ns)
-
-labels = [
-    'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
-    'Human evaluation', 'Contraction', 'Potential outcomes'
-]
-colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c']
-
-for i in range(failure_rates.shape[1]):
-    plt.errorbar(x_ax,
-                 failure_rates[:, i],
-                 label=labels[i],
-                 c=colours[i],
-                 yerr=failure_SEs[:, i])
-
-plt.title('Failure rate vs. Acceptance rate')
-plt.xlabel('Acceptance rate')
-plt.ylabel('Failure rate')
-plt.legend()
-plt.grid()
-
-plt.savefig(save_name + '_all')
-
-plt.show()
-
-print("\nFailure rates:")
-print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
-
-print("\nMean absolute errors:")
-for i in range(1, failure_rates.shape[1]):
-    print(
-        labels[i].ljust(len(max(labels, key=len))),
-        np.round(
-            np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
\ No newline at end of file
diff --git a/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py b/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py
deleted file mode 100644
index 49e23f2..0000000
--- a/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py
+++ /dev/null
@@ -1,532 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-# Author: Riku Laine
-# Date: 26JUL2019
-# Project name: Potential outcomes in model evaluation
-# Description: This script creates the figures and results used 
-#              in empirical data experiments.
-#
-# Parameters:
-# -----------
-# (1) figure_path : file name for saving the created figures.
-# (2) group_amount : How many groups if Jung-inspired model is used.
-# (3) stan_code_file_name : Name of file containing the stan model code.
-# (4) sigma_tau : Values of prior variance for the Jung-inspired model.
-# (5) data_path : File of compas data.
-
-"""
-
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import scipy.special as ssp
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
-import pystan
-
-plt.switch_backend('agg')
-
-import sys
-
-# figure storage name
-figure_path = sys.argv[1]
-
-# How many groups if jung model is used
-group_amount = int(sys.argv[2])
-
-# Name of stan model code file
-stan_code_file_name = sys.argv[3]
-
-# Variance prior
-sigma_tau = float(sys.argv[4])
-
-# figure storage name
-data_path = sys.argv[5]
-
-# Prefix for the figures and log files
-
-print("These results have been obtained with the following settings:")
-
-print("Number of groups:", group_amount)
-
-print("Prior for the variances:", sigma_tau)
-
-save_name = "sl_compas"
-
-def inv_logit(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
-
-
-def logit(x):
-    return np.log(x) - np.log(1.0 - x)
-
-
-def inverse_cumulative(x, mu, sigma):
-    '''Compute the inverse of the cumulative distribution of logit-normal
-    distribution at x with parameters mu and sigma (mean and st.dev.).'''
-
-    return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
-
-def standardError(p, n):
-    denominator = p * (1 - p)
-    
-    return np.sqrt(denominator / n)
-
-##########################
-
-# ## Evaluator modules
-
-# ### Convenience functions
-
-def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
-    '''
-    Fit a predictive model (default logistic regression) with given training 
-    instances and return probabilities for test instances to obtain a given 
-    class label.
-    
-    Arguments:
-    ----------
-    
-    x_train -- x values of training instances
-    y_train -- y values of training instances
-    x_test -- x values of test instances
-    class_value -- class label for which the probabilities are counted for.
-    model_type -- type of model to be fitted.
-    
-    Returns:
-    --------
-    (1) Trained predictive model
-    (2) Probabilities for given test inputs for given class.
-    '''
-
-    if model_type is None or model_type in ["logistic_regression", "lr"]:
-        # Instantiate the model (using the default parameters)
-        logreg = LogisticRegression(solver='lbfgs')
-
-        # Check shape and fit the model.
-        if x_train.ndim == 1:
-            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
-        else:
-            logreg = logreg.fit(x_train, y_train)
-
-        label_probs_logreg = getProbabilityForClass(x_test, logreg,
-                                                    class_value)
-
-        return logreg, label_probs_logreg
-
-    elif model_type in ["random_forest", "rf"]:
-        # Instantiate the model
-        forest = RandomForestClassifier(n_estimators=100, max_depth=3)
-
-        # Check shape and fit the model.
-        if x_train.ndim == 1:
-            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
-        else:
-            forest = forest.fit(x_train, y_train)
-
-        label_probs_forest = getProbabilityForClass(x_test, forest,
-                                                    class_value)
-
-        return forest, label_probs_forest
-
-    elif model_type == "fully_random":
-
-        label_probs = np.ones_like(x_test) / 2
-
-        model_object = lambda x: 0.5
-
-        return model_object, label_probs
-    else:
-        raise ValueError("Invalid model_type!", model_type)
-
-
-def getProbabilityForClass(x, model, class_value):
-    '''
-    Function (wrapper) for obtaining the probability of a class given x and a 
-    predictive model.
-
-    Arguments:
-    -----------
-    x -- individual features, an array of shape (observations, features)
-    model -- a trained sklearn model. Predicts probabilities for given x. 
-        Should accept input of shape (observations, features)
-    class_value -- the resulting class to predict (usually 0 or 1).
-
-    Returns:
-    --------
-    (1) The probabilities of given class label for each x.
-    '''
-    if x.ndim == 1:
-        # if x is vector, transform to column matrix.
-        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
-    else:
-        f_values = model.predict_proba(x)
-
-    # Get correct column of predicted class, remove extra dimensions and return.
-    return f_values[:, model.classes_ == class_value].flatten()
-
-
-# ### Contraction algorithm
-# 
-# Below is an implementation of Lakkaraju's team's algorithm presented in 
-# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant
-# parameters to be passed to the function are presented in the description.
-
-def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
-                accRateR_col, r):
-    '''
-    This is an implementation of the algorithm presented by Lakkaraju
-    et al. in their paper "The Selective Labels Problem: Evaluating 
-    Algorithmic Predictions in the Presence of Unobservables" (2017).
-
-    Arguments:
-    ----------
-    df -- The (Pandas) data frame containing the data, judge decisions,
-        judge IDs, results and probability scores.
-    judgeIDJ_col -- String, the name of the column containing the judges' IDs
-        in df.
-    decisionT_col -- String, the name of the column containing the judges' decisions
-    resultY_col -- String, the name of the column containing the realization
-    modelProbS_col -- String, the name of the column containing the probability
-        scores from the black-box model B.
-    accRateR_col -- String, the name of the column containing the judges' 
-        acceptance rates
-    r -- Float between 0 and 1, the given acceptance rate.
-
-    Returns:
-    --------
-    (1) The estimated failure rate at acceptance rate r.
-    '''
-    # Get ID of the most lenient judge.
-    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
-
-    # Subset. "D_q is the set of all observations judged by q."
-    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
-
-    # All observations of R_q have observed outcome labels.
-    # "R_q is the set of observations in D_q with observed outcome labels."
-    R_q = D_q[D_q[decisionT_col] == 1].copy()
-
-    # Sort observations in R_q in descending order of confidence scores S and
-    # assign to R_sort_q.
-    # "Observations deemed as high risk by B are at the top of this list"
-    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
-
-    number_to_remove = int(
-        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
-
-    # "R_B is the list of observations assigned to t = 1 by B"
-    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
-
-    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
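As a worked example of the counting logic above (all numbers are made up):

# Suppose the most lenient judge q saw |D_q| = 100 cases, released |R_q| = 80
# of them, and the target acceptance rate is r = 0.7. B must detain
# (1 - r) * |D_q| = 30 cases in total; q already detained 20, so the 10
# highest-risk released cases are removed and R_B keeps the remaining 70.
D_q_size, R_q_size, r_example = 100, 80, 0.7
number_to_remove_example = int(
    round((1.0 - r_example) * D_q_size - (D_q_size - R_q_size)))
assert number_to_remove_example == 10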
-
-
-# ### Evaluators
-
-def trueEvaluationEvaluator(df, resultY_col, r):
-
-    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(df.shape[0] * r))
-    
-    failed = df[resultY_col][0:to_release] == 0
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
-
-    df_observed = df.loc[df[decisionT_col] == 1, :]
-
-    df_observed = df_observed.sort_values(by='B_prob_0_model',
-                                              inplace=False,
-                                              ascending=True)
-
-    to_release = int(round(df_observed.shape[0] * r))
-    
-    failed = df_observed[resultY_col][0:to_release] == 0
-
-    if adjusted:
-        return np.mean(failed), df.shape[0]
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
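To make the two denominators above concrete, a small made-up example:

# 1000 observations in df, r = 0.5 so 300 of the observed (T = 1) cases are
# "released", and 45 of those 300 have result_Y == 0.
n_all, n_released, n_failed = 1000, 300, 45
unadjusted_rate = n_failed / n_all       # 0.045, failures over all observations
adjusted_rate = n_failed / n_released    # 0.15, np.mean(failed) over the released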
-
-
-def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
-                             accRateR_col, r):
-
-    # Get judges with correct leniency as list
-    is_correct_leniency = df[accRateR_col].round(1) == r
-
-    # No judges with correct leniency
-    if np.sum(is_correct_leniency) == 0:
-        return np.nan, np.nan
-
-    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
-
-    # Released are the people they judged and released, T = 1
-    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
-                  & (df[decisionT_col] == 1)]
-
-    failed = released[resultY_col] == 0
-    
-    # Get their failure rate, aka ratio of reoffenders to number of people judged in total
-    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
-
-###################
-
-# Read in the data
-compas_raw = pd.read_csv(data_path)
-
-# Select columns
-compas = compas_raw[[
-    'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
-    'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
-    'two_year_recid', 'c_jail_in', 'c_jail_out'
-]]
-
-# Subset values, see reasons in ProPublica methodology.
-compas = compas.query('days_b_screening_arrest <= 30 and \
-                      days_b_screening_arrest >= -30 and \
-                      is_recid != -1 and \
-                      c_charge_degree != "O"')
-
-# Drop row if score_text is na
-compas = compas[compas.score_text.notnull()]
-
-# Recode recidivism values to fit earlier notation
-# So here result_Y = 1 - is_recid (inverted binary coding).
-compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1)
-
-# Convert string values to dummies, drop first so full rank
-compas_dummy = pd.get_dummies(
-    compas,
-    columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
-    drop_first=True)
-
-compas_dummy.drop(columns=[
-    'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
-    'two_year_recid', 'score_text', 'is_recid'
-],
-    inplace=True)
-
-# Shuffle rows for random judge assignment
-compas_shuffled = compas_dummy.sample(frac=1)
-
-nJudges_M = 9
-
-# Assign judges as evenly as possible
-judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False)
-
-# Assign fixed leniencies from 0.1 to 0.9
-judge_leniency = np.arange(1, 10) / 10
-
-judge_leniency = judge_leniency[judge_ID]
-
-compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID,
-                                         judge_leniency=judge_leniency)
-
-# Sort by judges then probabilities in decreasing order
-# Most dangerous for each judge are at the top.
-compas_shuffled.sort_values(by=["judge_ID", "decile_score"],
-                            ascending=False,
-                            inplace=True)
-
-# Iterate over the data. A subject is given a negative decision if they are
-# among the top (1 - r) * 100% most dangerous individuals assigned to their
-# judge, i.e. if their within-judge index is less than (1 - acceptance rate)
-# times the number of subjects assigned to that judge.
-compas_shuffled.reset_index(drop=True, inplace=True)
-
-subjects_allocated = compas_shuffled.judge_ID.value_counts()
-
-compas_shuffled['judge_index'] = compas_shuffled.groupby('judge_ID').cumcount()
-
-compas_shuffled['decision_T'] = np.where(
-    compas_shuffled['judge_index'] < (1 - compas_shuffled['judge_leniency']) *
-    subjects_allocated[compas_shuffled['judge_ID']].values, 0, 1)
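For intuition, a minimal sketch of the rule above with made-up numbers:

import numpy as np

# A judge with leniency 0.3 and 600 assigned subjects gives a negative
# decision (T = 0) to the (1 - 0.3) * 600 = 420 subjects ranked most
# dangerous, i.e. those with within-judge index 0..419 after the sort above.
r_example, n_assigned = 0.3, 600
toy_index = np.arange(n_assigned)
toy_decision = np.where(toy_index < (1 - r_example) * n_assigned, 0, 1)
assert toy_decision.sum() == 180  # 180 positive decisions out of 600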
-
-compas_labeled = compas_shuffled.copy()
-compas_unlabeled = compas_shuffled.copy()
-
-# Hide unobserved
-compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan
-
-# Choose feature_columns
-feature_cols = ~compas_labeled.columns.isin(
-    ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T'])
-
-feature_cols = compas_labeled.columns[feature_cols]
-
-
-####### Draw figures ###########
-
-failure_rates = np.zeros((8, 6))
-error_Ns = np.zeros((8, 6))
-
-# Split data
-train, test_labeled = train_test_split(compas_labeled, test_size=0.5)
-
-# Assign same observations to unlabeled data
-test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values]
-
-
-# Train a logistic regression model
-B_model, predictions = fitPredictiveModel(
-    train.loc[train['decision_T'] == 1, feature_cols],
-    train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0)
-
-test_labeled = test_labeled.assign(B_prob_0_model=predictions)
-test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
-
-test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
-
-# Find observed values
-observed = test_labeled['decision_T'] == 1
-
-# Assign data to the model
-dat = dict(D=1,
-           N_obs=np.sum(observed),
-           N_cens=np.sum(~observed),
-           K=group_amount,
-           sigma_tau=sigma_tau,
-           M=len(set(compas_labeled['judge_ID'])),
-           jj_obs=test_labeled.loc[observed, 'judge_ID']+1,
-           jj_cens=test_labeled.loc[~observed, 'judge_ID']+1,
-           kk_obs=kk_array[observed]+1,
-           kk_cens=kk_array[~observed]+1,
-           dec_obs=test_labeled.loc[observed, 'decision_T'],
-           dec_cens=test_labeled.loc[~observed, 'decision_T'],
-           X_obs=test_labeled.loc[observed, 'B_prob_0_model'].values.reshape(-1, 1),
-           X_cens=test_labeled.loc[~observed, 'B_prob_0_model'].values.reshape(-1, 1),
-           y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
-
-sm = pystan.StanModel(file=stan_code_file_name)
-
-fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9))
-
-pars = fit.extract()
-
-print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
-
-plt.figure(figsize=(15,30))
-
-fit.plot();
-
-plt.savefig(save_name + '_stan_diagnostic_plot')
-
-plt.show()
-plt.close('all')
-
-# Bayes
-
-# Initialize a matrix with one row per posterior sample
-# and one column per observation
-y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
-
-# Impute the unobserved outcomes with the estimated ones
-y_imp[:, ~observed] = 1-pars['y_est']
-
-# Fill in the observed outcomes with the observed values
-y_imp[:, observed] = 1-test_labeled.loc[observed, 'result_Y']
-
-Rs = np.arange(.1, .9, .1)
-
-to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
-
-f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan)
-
-for i in range(len(to_release_list)):
-    
-    failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
-    
-    est_failure_rates =  failed / test_labeled.shape[0]
-            
-    failure_rates[i, 5] = np.mean(est_failure_rates)
-    
-    error_Ns[i, 5] = test_labeled.shape[0]
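A tiny made-up sketch of the imputation logic used above: three posterior draws over four observations sorted by predicted risk, releasing the first two.

import numpy as np

y_imp_toy = np.array([[1., 0., 1., 0.],
                      [0., 0., 1., 1.],
                      [1., 1., 0., 0.]])
failed_toy = np.sum(y_imp_toy[:, 0:2], axis=1)  # failures per draw: 1, 0, 2
f_rate_toy = np.mean(failed_toy / 4)            # posterior mean failure rate, 0.25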
-
-for r in np.arange(1, 9):
-
-    print(".", end="")
-
-    # True evaluation
-
-    FR, N = trueEvaluationEvaluator(test_unlabeled, 'result_Y', r / 10)
-    
-    failure_rates[r - 1, 0] = FR
-    error_Ns[r - 1, 0] = N
-
-    # Labeled outcomes only
-
-    FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10)
-
-    failure_rates[r - 1, 1] = FR
-    error_Ns[r - 1, 1] = N
-    
-    # Adjusted labeled outcomes
-
-    FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10,
-                                     adjusted=True)
-
-    failure_rates[r - 1, 2] = FR
-    error_Ns[r - 1, 2] = N
-
-    # Human evaluation
-
-    FR, N = humanEvaluationEvaluator(test_labeled, 'judge_ID', 'decision_T', 
-                                     'result_Y',  'judge_leniency', 
-                                     r / 10)
-
-    failure_rates[r - 1, 3] = FR
-    error_Ns[r - 1, 3] = N
-
-    # Contraction
-
-    FR, N = contraction(test_labeled, 'judge_ID', 'decision_T', 'result_Y', 
-                        'B_prob_0_model', 'judge_leniency', r / 10)
-
-    failure_rates[r - 1, 4] = FR
-    error_Ns[r - 1, 4] = N
-
-x_ax = np.arange(0.1, 0.9, 0.1)
-
-failure_SEs = standardError(failure_rates, error_Ns)
-
-labels = [
-    'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
-    'Human evaluation', 'Contraction', 'Potential outcomes'
-]
-colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c']
-
-for i in range(failure_rates.shape[1]):
-    plt.errorbar(x_ax,
-                 failure_rates[:, i],
-                 label=labels[i],
-                 c=colours[i],
-                 yerr=failure_SEs[:, i])
-
-plt.title('Failure rate vs. Acceptance rate')
-plt.xlabel('Acceptance rate')
-plt.ylabel('Failure rate')
-plt.legend()
-plt.grid()
-
-plt.savefig(save_name + '_all')
-
-plt.show()
-
-print("\nFailure rates:")
-print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
-
-print("\nMean absolute errors:")
-for i in range(1, failure_rates.shape[1]):
-    print(
-        labels[i].ljust(len(max(labels, key=len))),
-        np.round(
-            np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
\ No newline at end of file
diff --git a/analysis_and_scripts/stan_modelling_theoretic.py b/analysis_and_scripts/stan_modelling_theoretic.py
index 4f3c5a9..47847a9 100644
--- a/analysis_and_scripts/stan_modelling_theoretic.py
+++ b/analysis_and_scripts/stan_modelling_theoretic.py
@@ -1,29 +1,24 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 '''
 # Author: Riku Laine
-# Date: 25JUL2019 (start)
+# Date: 12AUG2019
 # Project name: Potential outcomes in model evaluation
 # Description: This script creates the figures and results used 
 #              in synthetic data experiments.
 #
 # Parameters:
 # -----------
-# (1) figure_path : file name for saving the created figures.
+# (1) figure_path : file location for saving the created figures.
 # (2) N_sim : Size of simulated data set.
 # (3) M_sim : Number of judges in simulated data, 
 #             N_sim must be divisible by M_sim!
-# (4) which : Which data + outcome analysis should be performed.
-# (5) group_amount : How many groups if Jung-inspired model is used.
+# (4) which : Which data + outcome analysis should be performed. Ranges from
+#             1 to 14. See the end of the script for which figure is which.
+#             Separate executions are preferred to keep runtime and memory
+#             usage reasonable.
+# (5) group_amount : How many groups if non-linear model is used.
 # (6) stan_code_file_name : Name of file containing the stan model code.
-# (7) sigma_tau : Values of prior variance for the Jung-inspired model.
-# 
-# Modifications:
-# --------------
-# 26JUL2019 : RL - Changed add_epsilon default to True
-#                - Corrected the data set used by the causal model (from 
-#                  unlabeled to labeled)
-#                - Corrections to which == 11, coefficients and prints.
-#                - All plot, thinning off.
-#                - Fix one decider with leniency 0.9
+# (7) sigma_tau : Values of prior variance.
 #
 '''
 # Refer to the `notes.tex` file for explanations about the modular framework.
@@ -35,7 +30,6 @@ import pandas as pd
 import matplotlib.pyplot as plt
 import scipy.stats as scs
 import scipy.special as ssp
-import scipy.integrate as si
 import numpy.random as npr
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier
@@ -85,8 +79,7 @@ print("Prior for the variances:", sigma_tau)
 
 # Basic functions
 
-
-def inv_logit(x):
+def inverseLogit(x):
     return 1.0 / (1.0 + np.exp(-1.0 * x))
 
 
@@ -94,11 +87,16 @@ def logit(x):
     return np.log(x) - np.log(1.0 - x)
 
 
-def inverse_cumulative(x, mu, sigma):
+def inverseCumulative(x, mu, sigma):
     '''Compute the inverse of the cumulative distribution of logit-normal
     distribution at x with parameters mu and sigma (mean and st.dev.).'''
 
-    return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
+    return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
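A quick numeric sanity check of the function above:

# With mu = 0 and sigma = 1 the median of the logit-normal distribution is
# inverseLogit(0) = 0.5, so the inverse CDF at x = 0.5 should return 0.5.
assert abs(inverseCumulative(x=0.5, mu=0, sigma=1) - 0.5) < 1e-12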
+
+def standardError(p, n):
+    '''Standard error of an estimated proportion p from a sample of size n.'''
+    denominator = p * (1 - p)
+    
+    return np.sqrt(denominator / n)
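For example, the standard error of an estimated failure rate of 0.2 computed from 1000 observations is sqrt(0.2 * 0.8 / 1000), about 0.0126:

se_example = standardError(0.2, 1000)  # about 0.0126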
 
 
 # ## Data generation modules
@@ -111,11 +109,11 @@ def bernoulliDGWithoutUnobservables(N_total=50000):
     # Sample feature X from standard Gaussian distribution, N(0, 1).
     df = df.assign(X=npr.normal(size=N_total))
 
-    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inv_logit(X)
-    df = df.assign(probabilities_Y=inv_logit(df.X))
+    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
+    df = df.assign(probabilities_Y=inverseLogit(df.X))
 
-    # Draw Y ~ Bernoulli(1 - inv_logit(X))
-    # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inv_logit(X)
+    # Draw Y ~ Bernoulli(1 - inverseLogit(X))
+    # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X)
     results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
 
     df = df.assign(result_Y=results)
@@ -137,7 +135,7 @@ def thresholdDGWithUnobservables(N_total=50000,
     df = df.assign(W=npr.normal(size=N_total))
 
     # Calculate P(Y=0|X, Z, W)
-    probabilities_Y = inv_logit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
+    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
 
     df = df.assign(probabilities_Y=probabilities_Y)
 
@@ -160,8 +158,8 @@ def bernoulliDGWithUnobservables(N_total=50000,
     df = df.assign(Z=npr.normal(size=N_total))
     df = df.assign(W=npr.normal(size=N_total))
 
-    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inv_logit(X)
-    probabilities_Y = inv_logit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
+    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
+    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
 
     df = df.assign(probabilities_Y=probabilities_Y)
 
@@ -183,6 +181,7 @@ def humanDeciderLakkaraju(df,
                           beta_Z=1,
                           add_epsilon=True):
     '''Decider module | Non-independent batch decisions.'''
+    # Decider as specified by Lakkaraju et al.
 
     # Assert that every judge will have the same number of subjects.
     assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
@@ -195,23 +194,24 @@ def humanDeciderLakkaraju(df,
 
     # Sample acceptance rates uniformly from a closed interval
     # from 0.1 to 0.9 and round to tenth decimal place.
-    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
-    #            values.
-    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
+    acceptance_rates = npr.uniform(.1, .9, nJudges_M)
     acceptance_rates = np.round(acceptance_rates, 10)
 
+
     # Replicate the rates so they can be attached to the corresponding judge ID.
     df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
 
+    # Add noise
     if add_epsilon:
         epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
     else:
         epsilon = 0
     
+    # Compute probability for jail decision
     if featureZ_col is None:
-        probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon)
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
     else:
-        probabilities_T = inv_logit(beta_X * df[featureX_col] +
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                     beta_Z * df[featureZ_col] + epsilon)
 
 
@@ -250,8 +250,9 @@ def bernoulliDecider(df,
                     beta_Z=1,
                     add_epsilon=True):
     '''Use X and Z to make a decision with probability 
-    P(T=0|X, Z)=inv_logit(beta_X*X+beta_Z*Z).'''
-
+    P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).'''
+    # No leniencies involved
+    
     # Assert that every judge will have the same number of subjects.
     assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
 
@@ -267,9 +268,9 @@ def bernoulliDecider(df,
         epsilon = 0
     
     if featureZ_col is None:
-        probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon)
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
     else:
-        probabilities_T = inv_logit(beta_X * df[featureX_col] +
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                     beta_Z * df[featureZ_col] + epsilon)
 
     df = df.assign(probabilities_T=probabilities_T)
@@ -298,7 +299,8 @@ def quantileDecider(df,
                     nJudges_M=100,
                     beta_X=1,
                     beta_Z=1,
-                    add_epsilon=True):
+                    add_epsilon=True,
+                    leniency_upper_limit=0.9):
     '''Assign decisions by the value of inverse cumulative distribution function
     of the logit-normal distribution at leniency r.'''
     
@@ -313,9 +315,7 @@ def quantileDecider(df,
 
     # Sample acceptance rates uniformly from a closed interval
     # from 0.1 to 0.9 and round to tenth decimal place.
-    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
-    #            values.
-    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
+    acceptance_rates = npr.uniform(.1, leniency_upper_limit, nJudges_M)
     acceptance_rates = np.round(acceptance_rates, 10)
 
     # Replicate the rates so they can be attached to the corresponding judge ID.
@@ -327,20 +327,20 @@ def quantileDecider(df,
         epsilon = 0
     
     if featureZ_col is None:
-        probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon)
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
 
         # Compute the bounds straight from the inverse cumulative.
         # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2.
-        df = df.assign(bounds=inverse_cumulative(
+        df = df.assign(bounds=inverseCumulative(
             x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2)))
     else:
-        probabilities_T = inv_logit(beta_X * df[featureX_col] +
+        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                     beta_Z * df[featureZ_col] + epsilon)
 
         # Compute the bounds straight from the inverse cumulative.
         # Assuming X and Z are i.i.d standard Gaussians with variance 1.
         # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z).
-        df = df.assign(bounds=inverse_cumulative(
+        df = df.assign(bounds=inverseCumulative(
             x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2)))
 
     df = df.assign(probabilities_T=probabilities_T)
@@ -348,6 +348,12 @@ def quantileDecider(df,
     # Assign negative decision if the predicted probability (probabilities_T) is
     # over the judge's threshold (bounds).
     df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1))
+    
+    # Calculate the observed acceptance rates.
+    acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values
+
+    # Replicate the rates so they can be attached to the corresponding judge ID.
+    df.acceptanceRate_R = np.repeat(acceptance_rates, nSubjects_N)
 
     df_labeled = df.copy()
 
@@ -360,7 +366,7 @@ def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
     '''Doesn't use any information about X and Z to make decisions.
     
     If use_acceptance_rates is False (default) then all decisions are positive
-    with probabiltiy 0.5. If True, probabilities will be sampled from 
+    with probability 0.5. If True, probabilities will be sampled from 
     U(0.1, 0.9) and rounded to tenth decimal place.'''
 
     # Assert that every judge will have the same number of subjects.
@@ -375,7 +381,10 @@ def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
     if use_acceptance_rates:
         # Sample acceptance rates uniformly from a closed interval
         # from 0.1 to 0.9 and round to tenth decimal place.
-        acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10)
+        # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
+        #            values.
+        acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
+        acceptance_rates = np.round(acceptance_rates, 10)
     else:
         # No real leniency here -> set to 0.5.
         acceptance_rates = np.ones(nJudges_M) * 0.5
@@ -402,7 +411,9 @@ def biasDecider(df,
                 add_epsilon=True):
     '''
     Biased decider: If X > 1, then X <- X * 0.75. People with high X, 
-    get more positive decisions as they should.
+    get more positive decisions than they should. And if -2 < X < -1, then
+    X <- X + 0.5. People with X in (-2, -1) get fewer positive decisions
+    than they should.
     
     '''
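An illustrative sketch (not part of the script) of the distortion described in the docstring, assuming it is applied elementwise with np.where:

import numpy as np

X_toy = np.array([-2.5, -1.5, 0.0, 1.5])
biased_X_toy = np.where(X_toy > 1, X_toy * 0.75, X_toy)
biased_X_toy = np.where((X_toy > -2) & (X_toy < -1), X_toy + 0.5, biased_X_toy)
# biased_X_toy is now [-2.5, -1.0, 0.0, 1.125]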
 
@@ -420,7 +431,7 @@ def biasDecider(df,
     assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
 
     # Use quantile decider, but judge by the biased X.
-    df_labeled, df = humanDeciderLakkaraju(df,
+    df_labeled, df = quantileDecider(df,
                                      featureX_col='biased_X',
                                      featureZ_col=featureZ_col,
                                      nJudges_M=nJudges_M,
@@ -522,97 +533,6 @@ def getProbabilityForClass(x, model, class_value):
     # Get correct column of predicted class, remove extra dimensions and return.
     return f_values[:, model.classes_ == class_value].flatten()
 
-
-def cdf(x_0, model, class_value):
-    '''
-    Cumulative distribution function as described above. Integral is 
-    approximated using Simpson's rule for efficiency.
-    
-    Arguments:
-    ----------
-    
-    x_0 -- private features of an instance for which the value of cdf is to be
-        calculated.
-    model -- a trained sklearn model. Predicts probabilities for given x. 
-        Should accept input of shape (observations, features)
-    class_value -- the resulting class to predict (usually 0 or 1).
-
-    '''
-
-    def prediction(x):
-        return getProbabilityForClass(
-            np.array([x]).reshape(-1, 1), model, class_value)
-
-    prediction_x_0 = prediction(x_0)
-
-    x_values = np.linspace(-15, 15, 40000)
-
-    x_preds = prediction(x_values)
-
-    y_values = scs.norm.pdf(x_values)
-
-    results = np.zeros(x_0.shape[0])
-
-    for i in range(x_0.shape[0]):
-
-        y_copy = y_values.copy()
-
-        y_copy[x_preds > prediction_x_0[i]] = 0
-
-        results[i] = si.simps(y_copy, x=x_values)
-
-    return results
-
-
-def bailIndicator(r, y_model, x_train, x_test):
-    '''
-    Indicator function for whether a judge will bail or jail a suspect.
-    Rationale explained above.
-
-    Algorithm:
-    ----------
-
-    (1) Calculate recidivism probabilities from training set with a trained 
-        model and assign them to predictions_train.
-
-    (2) Calculate recidivism probabilities from test set with the trained 
-        model and assign them to predictions_test.
-
-    (3) Construct a quantile function of the probabilities in
-        in predictions_train.
-
-    (4)
-    For pred in predictions_test:
-
-        if pred belongs to a percentile (computed from step (3)) lower than r
-            return True
-        else
-            return False
-
-    Arguments:
-    ----------
-
-    r -- float, acceptance rate, between 0 and 1
-    y_model -- a trained sklearn predictive model to predict the outcome
-    x_train -- private features of the training instances
-    x_test -- private features of the test instances
-
-    Returns:
-    --------
-    (1) Boolean list indicating a bail decision (bail = True) for each 
-        instance in x_test.
-    '''
-
-    predictions_train = getProbabilityForClass(x_train, y_model, 0)
-
-    predictions_test = getProbabilityForClass(x_test, y_model, 0)
-
-    return [
-        scs.percentileofscore(predictions_train, pred, kind='weak') < r
-        for pred in predictions_test
-    ]
-
-
 # ### Contraction algorithm
 # 
 # Below is an implementation of Lakkaraju's team's algorithm presented in 
@@ -649,6 +569,29 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
 
     # Subset. "D_q is the set of all observations judged by q."
     D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
+    
+    ##### Agreement rate computation begins #######
+    
+    max_leniency = max(df[accRateR_col])
+    
+    if max_leniency < r:
+        return np.nan, np.nan
+    
+    # Sort D_q
+    # "Observations deemed as high risk by B are at the top of this list"
+    D_sort_q = D_q.sort_values(by=modelProbS_col, ascending=False)
+    
+    # The agreement rate is now computed as the percentage of all the positive
+    # decisions given by q in the 1-r % of the most dangerous subjects.
+    all_negatives = np.sum(D_sort_q[decisionT_col] == 0)
+    
+    n_most_dangerous = int(round(D_q.shape[0] * (1-max_leniency)))
+    
+    agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous] == 0)
+
+    print("The agreement rate of contraction is:", agreed_decisions/all_negatives)
+    
+    ##### Agreement rate computation ends #######
 
     # All observations of R_q have observed outcome labels.
     # "R_q is the set of observations in D_q with observed outcome labels."
@@ -665,50 +608,20 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
     # "R_B is the list of observations assigned to t = 1 by B"
     R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
 
-    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0]
+    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
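As a made-up numerical example of the agreement rate printed inside contraction() above:

# If the most lenient judge has leniency 0.8 and judged 200 cases, the
# 40 cases ranked riskiest by B are compared against the judge's roughly
# 40 negative decisions; if 25 of those negative decisions fall inside the
# top-40 slice, the agreement rate is 25 / 40 = 0.625.
all_negatives_example = 40
agreed_decisions_example = 25
agreement_rate_example = agreed_decisions_example / all_negatives_example  # 0.625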
 
 
 # ### Evaluators
 
-def contractionEvaluator(df, featureX_col, judgeIDJ_col, decisionT_col,
-                         resultY_col, accRateR_col, r):
-
-    train, test = train_test_split(df, test_size=0.5)
-
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train[decisionT_col] == 1, featureX_col],
-        train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col],
-        0)
-
-    test = test.assign(B_prob_0_model=predictions)
-
-    # Invoke the original contraction.
-    FR = contraction(test,
-                     judgeIDJ_col=judgeIDJ_col,
-                     decisionT_col=decisionT_col,
-                     resultY_col=resultY_col,
-                     modelProbS_col="B_prob_0_model",
-                     accRateR_col=accRateR_col,
-                     r=r)
-
-    return FR
-
-
 def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r):
 
-    train, test = train_test_split(df, test_size=0.5)
-
-    B_model, predictions = fitPredictiveModel(train[featureX_col],
-                                              train[resultY_col],
-                                              test[featureX_col], 0)
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
 
-    test = test.assign(B_prob_0_model=predictions)
-
-    test.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(test.shape[0] * r))
+    to_release = int(round(df.shape[0] * r))
+    
+    failed = df[resultY_col][0:to_release] == 0
 
-    return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0]
+    return np.sum(failed) / df.shape[0], df.shape[0]
 
 
 def labeledOutcomesEvaluator(df,
@@ -718,28 +631,20 @@ def labeledOutcomesEvaluator(df,
                              r,
                              adjusted=False):
 
-    train, test = train_test_split(df, test_size=0.5)
-
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train[decisionT_col] == 1, featureX_col],
-        train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col],
-        0)
+    df_observed = df.loc[df[decisionT_col] == 1, :]
 
-    test = test.assign(B_prob_0_model=predictions)
-
-    test_observed = test.loc[test[decisionT_col] == 1, :]
-
-    test_observed = test_observed.sort_values(by='B_prob_0_model',
+    df_observed = df_observed.sort_values(by='B_prob_0_model',
                                               inplace=False,
                                               ascending=True)
 
-    to_release = int(round(test_observed.shape[0] * r))
+    to_release = int(round(df_observed.shape[0] * r))
+    
+    failed = df_observed[resultY_col][0:to_release] == 0
 
     if adjusted:
-        return np.mean(test_observed[resultY_col][0:to_release] == 0)
+        return np.mean(failed), df.shape[0]
 
-    return np.sum(
-        test_observed[resultY_col][0:to_release] == 0) / test.shape[0]
+    return np.sum(failed) / df.shape[0], df.shape[0]
 
 
 def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
@@ -750,7 +655,7 @@ def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
 
     # No judges with correct leniency
     if np.sum(is_correct_leniency) == 0:
-        return np.nan
+        return np.nan, np.nan
 
     correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
 
@@ -758,24 +663,10 @@ def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
     released = df[df[judgeIDJ_col].isin(correct_leniency_list)
                   & (df[decisionT_col] == 1)]
 
+    failed = released[resultY_col] == 0
+    
     # Get their failure rate, aka ratio of reoffenders to number of people judged in total
-    return np.sum(released[resultY_col] == 0) / correct_leniency_list.shape[0]
-
-
-def causalEvaluator(df, featureX_col, decisionT_col, resultY_col, r):
-
-    train, test = train_test_split(df, test_size=0.5)
-
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train[decisionT_col] == 1, featureX_col],
-        train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col],
-        0)
-
-    test = test.assign(B_prob_0_model=predictions)
-
-    released = cdf(test[featureX_col], B_model, 0) < r
-
-    return np.mean(test.B_prob_0_model * released)
+    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
 
 
 def monteCarloEvaluator(df,
@@ -791,27 +682,17 @@ def monteCarloEvaluator(df,
                         sigma_X=1,
                         sigma_Z=1):
 
-    # Train the models and assign the predicted probabilities.
-    train, test = train_test_split(df, test_size=0.5)
-
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train[decisionT_col] == 1, featureX_col],
-        train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col],
-        0)
-
-    test = test.assign(B_prob_0_model=predictions)
-
     # Compute the predicted/assumed decision bounds for all the judges.
-    q_r = inverse_cumulative(x=test[accRateR_col],
+    q_r = inverseCumulative(x=df[accRateR_col],
                              mu=mu_X + mu_Z,
                              sigma=np.sqrt((beta_X * sigma_X)**2 +
                                            (beta_Z * sigma_Z)**2))
 
-    test = test.assign(bounds=logit(q_r) - test[featureX_col])
+    df = df.assign(bounds=logit(q_r) - df[featureX_col])
 
     # Compute the expectation of Z when it is known to come from truncated
     # Gaussian.
-    alphabeta = (test.bounds - mu_Z) / (sigma_Z)
+    alphabeta = (df.bounds - mu_Z) / (sigma_Z)
 
     Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z)  # 1 - cdf(ab)
 
@@ -822,31 +703,27 @@ def monteCarloEvaluator(df,
     exp_upper_trunc = mu_Z - (
         sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta)
 
-    exp_Z = (1 - test[decisionT_col]
-             ) * exp_lower_trunc + test[decisionT_col] * exp_upper_trunc
+    exp_Z = (1 - df[decisionT_col]
+             ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc
 
     # Attach the predicted probability for Y=0 to data.
-    test = test.assign(predicted_Y=inv_logit(test[featureX_col] + exp_Z))
+    df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z))
 
-    # Predictions drawn from binomial. (Should fix this.)
-    predictions = npr.binomial(n=1, p=1 - test.predicted_Y, size=test.shape[0])
+    # Predictions drawn from binomial.
+    predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0])
 
-    test[resultY_col] = np.where(test[decisionT_col] == 0, predictions,
-                                 test[resultY_col])
+    df[resultY_col] = np.where(df[decisionT_col] == 0, predictions,
+                                 df[resultY_col])
 
-    test.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(test.shape[0] * r))
-
-    return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0]
+    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
 
+    to_release = int(round(df.shape[0] * r))
+    
+    failed = df[resultY_col][0:to_release] == 0
 
-# ## Performance comparison
-# 
-# Below we try to replicate the results obtained by Lakkaraju and compare 
-# their model's performance to the one of ours.
+    return np.sum(failed) / df.shape[0], df.shape[0]
 
-def drawDiagnostics(title, save_name, save, f_rates, titles):
+def drawDiagnostics(title, save_name, f_rates, titles):
     
     cols = 2
     rows = np.ceil(len(f_rates) / cols)
@@ -874,394 +751,485 @@ def drawDiagnostics(title, save_name, save, f_rates, titles):
         plt.grid()
 
     plt.tight_layout()
-    plt.subplots_adjust(top=0.89)
+    plt.subplots_adjust(top=0.85)
     plt.suptitle(title, y=0.96, weight='bold')
 
-    if save:
-        plt.savefig(save_name + '_diagnostic_plot')
+    plt.savefig(save_name + '_diagnostic_plot')
 
     plt.show()
 
-
-def perfComp(dgModule, deciderModule, title, save_name, save=True, nIter=50):
-    failure_rates = np.zeros((8, 7))
-    failure_sems = np.zeros((8, 7))
+def perfComp(dgModule, deciderModule, title, save_name, model_type=None, 
+             fit_with_Z=False, nIter=10):
+    '''Performance comparison, a.k.a. the main figure.
+    
+    Parameters:
+    -----------
+        dgModule: Module creating the data
+        deciderModule: Module assigning the decisions
+        title: Title for the diagnostic figure
+        save_name: name for saving
+        model_type: what kind of model should be fitted. Logistic regression
+                    ("lr"), random forest ("rf") or fully random 
+                    ("fully_random")
+        fit_with_Z: should the predictive model be fit with Z (the latent 
+                    variable)
+        nIter: Number of train-test splits to be performed.
+        
+    '''
+    
+    failure_rates = np.zeros((8, 4))
+    failure_stds = np.zeros((8, 4))
 
     f_rate_true = np.zeros((nIter, 8))
     f_rate_label = np.zeros((nIter, 8))
-    f_rate_label_adj = np.zeros((nIter, 8))
-    f_rate_human = np.zeros((nIter, 8))
     f_rate_cont = np.zeros((nIter, 8))
-    f_rate_caus = np.zeros((nIter, 8))
+    f_rate_cf = np.zeros((nIter, 8))
 
     # Create data
     df = dgModule()
 
-    # Decicions
+    # Assign decisions
     df_labeled, df_unlabeled = deciderModule(df)
-
-    # Split data
-    train, test = train_test_split(df_labeled, test_size=0.5)
-
-    # Train model
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train['decision_T'] == 1, 'X'],
-        train.loc[train['decision_T'] == 1, 'result_Y'], test['X'], 0)
-
-    test = test.assign(B_prob_0_model=predictions)
-
-    test.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    kk_array = pd.qcut(test['B_prob_0_model'], group_amount, labels=False)
-
-    # Find observed values
-    observed = test['decision_T'] == 1
-
-    # Assign data to the model
-    dat = dict(D=1,
-               N_obs=np.sum(observed),
-               N_cens=np.sum(~observed),
-               K=group_amount,
-               sigma_tau=sigma_tau,
-               M=len(set(df_unlabeled['judgeID_J'])),
-               jj_obs=test.loc[observed, 'judgeID_J']+1,
-               jj_cens=test.loc[~observed, 'judgeID_J']+1,
-               kk_obs=kk_array[observed]+1,
-               kk_cens=kk_array[~observed]+1,
-               dec_obs=test.loc[observed, 'decision_T'],
-               dec_cens=test.loc[~observed, 'decision_T'],
-               X_obs=test.loc[observed, 'B_prob_0_model'].values.reshape(-1,1),
-               X_cens=test.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1),
-               y_obs=test.loc[observed, 'result_Y'].astype(int))
-
-    fit = sm.sampling(data=dat, chains=5, iter=4000, control = dict(adapt_delta=0.9))
-
-    pars = fit.extract()
-
-    plt.figure(figsize=(15,30))
-
-    fit.plot();
-
-    if save:
-        plt.savefig(save_name + '_stan_diagnostic_plot')
     
-    plt.show()
-    plt.close('all')
-
-    if save:
-        print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
+    for i in range(nIter):
 
-    # Bayes
-    
-    # Initialize a matrix with one row per posterior sample
-    # and one column per observation
-    y_imp = np.ones((pars['y_est'].shape[0], test.shape[0]))
+        # Split data
+        train, test_labeled = train_test_split(df_labeled, test_size=0.5)
+        
+        # Assign the same observations to the unlabeled data
+        test_unlabeled = df_unlabeled.iloc[test_labeled.index.values]
+
+        # Train model
+        if fit_with_Z:
+            B_model, predictions = fitPredictiveModel(
+                    train.loc[train['decision_T'] == 1, ['X', 'Z']],
+                    train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[['X', 'Z']], 
+                    0, model_type=model_type)
     
-    # Impute the unobserved outcomes with the estimated ones
-    y_imp[:, ~observed] = 1-pars['y_est']
+        else:
+            B_model, predictions = fitPredictiveModel(
+                    train.loc[train['decision_T'] == 1, 'X'],
+                    train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], 
+                    0, model_type=model_type)
+
+        # Attach predictions to data
+        test_labeled = test_labeled.assign(B_prob_0_model=predictions)
+        test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
     
-    # Fill in the observed outcomes with the observed values
-    y_imp[:, observed] = 1-test.loc[observed, 'result_Y']
-
-    Rs = np.arange(.1, .9, .1)
+        test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
     
-    to_release_list = np.round(test.shape[0] * Rs).astype(int)
+        # Assign group IDs
+        kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
     
-    f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan)
+        # Find observed values
+        observed = test_labeled['decision_T'] == 1
     
-    for i in range(len(to_release_list)):
-        est_failure_rates = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) / test.shape[0]
+        # Assign data to the model
         
-        f_rate_bayes[:, i] = est_failure_rates
+        ###############################
+        # Change this assignment if you want to use predictions as input to the
+        # model.
+        ###############################
+        dat = dict(D=1,
+                   N_obs=np.sum(observed),
+                   N_cens=np.sum(~observed),
+                   K=group_amount,
+                   sigma_tau=sigma_tau,
+                   M=len(set(df_unlabeled['judgeID_J'])),
+                   jj_obs=test_labeled.loc[observed, 'judgeID_J']+1,
+                   jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1,
+                   kk_obs=kk_array[observed]+1,
+                   kk_cens=kk_array[~observed]+1,
+                   dec_obs=test_labeled.loc[observed, 'decision_T'],
+                   dec_cens=test_labeled.loc[~observed, 'decision_T'],
+                   X_obs=test_labeled.loc[observed, 'X'].values.reshape(-1,1),
+                   X_cens=test_labeled.loc[~observed, 'X'].values.reshape(-1,1),
+                   y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
+
+        # Do the sampling - Change this if you wish to use other methods.
+        fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9))
+                
+        pars = fit.extract()
         
-        failure_rates[i, 6] = np.mean(est_failure_rates)    
-            
-    for i in range(nIter):
         
-        print(" [", i, "] ", sep='', end="")
-
-        for r in np.arange(1, 9):
+        
+        plt.figure(figsize=(15,30))
+    
+        fit.plot();
+    
+        plt.savefig(save_name + '_stan_convergence_diagnostic_plot_' + str(i))
+        
+        plt.show()
+        plt.close('all')
+        gc.collect()
+    
+        print(fit, file=open(save_name + '_stan_fit_diagnostics_' + str(i) + '.txt', 'w'))
 
+        # Bayes
+        
+        # Format matrix, each row is a sample of the posterior
+        # columns are observations
+                
+        y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
+        
+        # Impute the unobserved outcomes with the estimates.
+        # Reverse the binary coding so that failures (result_Y == 0) become
+        # ones and can be counted by summing.
+        y_imp[:, ~observed] = 1 - pars['y_est']
+        
+        # Remove the variables to conserve memory
+        del(pars)
+        del(fit)
+        
+        # Impute the observed
+        y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
+                            
+        for r in range(1, 9):
+            
+            to_release = np.round(test_labeled.shape[0] * (r / 10)).astype(int)
+                        
+            failed = np.sum(y_imp[:, 0:to_release], axis=1)
+            
+            f_rate_cf[i, r - 1] = np.mean(failed / test_labeled.shape[0])
+    
             print(".", end="")
-
+    
             # True evaluation
-
-            f_rate_true[i, r - 1] = trueEvaluationEvaluator(
-                df_unlabeled, 'X', 'decision_T', 'result_Y', r / 10)
-
+    
+            f_rate_true[i, r - 1], N = trueEvaluationEvaluator(test_unlabeled,
+                       'X', 'decision_T', 'result_Y', r / 10)
+                
             # Labeled outcomes only
-
-            f_rate_label[i, r - 1] = labeledOutcomesEvaluator(
-                df_labeled, 'X', 'decision_T', 'result_Y', r / 10)
-
-            # Adjusted labeled outcomes
-
-            f_rate_label_adj[i, r - 1] = labeledOutcomesEvaluator(
-                df_labeled,
-                'X',
-                'decision_T',
-                'result_Y',
-                r / 10,
-                adjusted=True)
-
-            # Human evaluation
-
-            f_rate_human[i, r - 1] = humanEvaluationEvaluator(
-                df_labeled, 'judgeID_J', 'decision_T', 'result_Y',
-                'acceptanceRate_R', r / 10)
-
+    
+            f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled, 
+                        'X', 'decision_T', 'result_Y', r / 10)
+            
             # Contraction
-
-            f_rate_cont[i, r - 1] = contractionEvaluator(
-                df_labeled, 'X', 'judgeID_J', 'decision_T', 'result_Y',
-                'acceptanceRate_R', r / 10)
-
-            # Causal model - analytic solution
-
-            f_rate_caus[i, r - 1] = monteCarloEvaluator(
-                df_labeled, 'X', 'decision_T', 'result_Y', 'acceptanceRate_R',
-                r / 10)
-
+    
+            f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judgeID_J', 
+                       'decision_T', 'result_Y', 'B_prob_0_model', 
+                       'acceptanceRate_R', r / 10)
+            
     failure_rates[:, 0] = np.mean(f_rate_true, axis=0)
     failure_rates[:, 1] = np.mean(f_rate_label, axis=0)
-    failure_rates[:, 2] = np.mean(f_rate_label_adj, axis=0)
-    failure_rates[:, 3] = np.mean(f_rate_human, axis=0)
-    failure_rates[:, 4] = np.mean(f_rate_cont, axis=0)
-    failure_rates[:, 5] = np.mean(f_rate_caus, axis=0)
-    #failure_rates[:, 6] = f_rate_bayes
-
-    failure_sems[:, 0] = scs.sem(f_rate_true, axis=0)
-    failure_sems[:, 1] = scs.sem(f_rate_label, axis=0)
-    failure_sems[:, 2] = scs.sem(f_rate_label_adj, axis=0)
-    failure_sems[:, 3] = scs.sem(f_rate_human, axis=0)
-    failure_sems[:, 4] = scs.sem(f_rate_cont, axis=0)
-    failure_sems[:, 5] = scs.sem(f_rate_caus, axis=0)
-    failure_sems[:, 6] = scs.sem(f_rate_bayes, axis=0, nan_policy='omit')
+    failure_rates[:, 2] = np.mean(f_rate_cont, axis=0)
+    failure_rates[:, 3] = np.mean(f_rate_cf, axis=0)
+
+    failure_stds[:, 0] = np.std(f_rate_true, axis=0)
+    failure_stds[:, 1] = np.std(f_rate_label, axis=0)
+    failure_stds[:, 2] = np.std(f_rate_cont, axis=0)
+    failure_stds[:, 3] = np.std(f_rate_cf, axis=0)
 
     x_ax = np.arange(0.1, 0.9, 0.1)
 
-    labels = [
-        'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
-        'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes'
-    ]
-    colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c']
+    labels = ['True Evaluation', 'Labeled outcomes', 'Contraction', 
+              'Counterfactuals']
+    
+    colours = ['g', 'magenta', 'b', 'r']
+    
+    line_styles = ['--', ':', '-.', '-']
 
     for i in range(failure_rates.shape[1]):
         plt.errorbar(x_ax,
                      failure_rates[:, i],
                      label=labels[i],
                      c=colours[i],
-                     yerr=failure_sems[:, i])
+                     linestyle=line_styles[i],
+                     yerr=failure_stds[:, i])
 
-    plt.title('Failure rate vs. Acceptance rate')
+    #plt.title('Failure rate vs. Acceptance rate')
     plt.xlabel('Acceptance rate')
     plt.ylabel('Failure rate')
     plt.legend()
     plt.grid()
     
-    if save: 
-        plt.savefig(save_name + '_all')
+    plt.savefig(save_name + '_all')
     
     plt.show()
 
     print("\nFailure rates:")
     print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
     
+    print("\nStandard deviations:")
+    print(np.array2string(failure_stds, formatter={'float_kind':lambda x: "%.5f" % x}))
+
     print("\nMean absolute errors:")
     for i in range(1, failure_rates.shape[1]):
         print(
             labels[i].ljust(len(max(labels, key=len))),
             np.round(
                 np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
-
+        
     drawDiagnostics(title=title,
-                    save_name=save_name,
-                    save=save,
-                    f_rates=[
-                        f_rate_true, f_rate_label, f_rate_label_adj,
-                        f_rate_human, f_rate_cont, f_rate_caus, f_rate_bayes
-                    ],
-                    titles=labels)
-
+                save_name=save_name,
+                f_rates=[f_rate_true, f_rate_label, f_rate_cont, f_rate_cf],
+                titles=labels)
 
+# Compile stan model
 sm = pystan.StanModel(file=stan_code_file_name)
 
 if which == 1:
-    print("Without unobservables (Bernoulli + independent decisions)")
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: random and random")
 
-    dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim)
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
+    decider = lambda x: randomDecider(x, nJudges_M=M_sim, use_acceptance_rates=True)
 
     perfComp(
         dg, lambda x: decider(x),
         "Fluctuation of failure rate estimates across iterations\n" +
         "Bernoulli + independent decisions, without unobservables",
-        figure_path + "sl_bernoulli_independent_without_Z"
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="fully_random", 
+        fit_with_Z=False
     )
-
-gc.collect()
+    
 plt.close('all')
-
-print("With unobservables in the data")
+gc.collect()
 
 if which == 2:
-    print("\nBernoulli + independent decisions")
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: random and y ~ x")
 
     dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+    decider = lambda x: randomDecider(x, nJudges_M=M_sim, use_acceptance_rates=True)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Bernoulli + independent decisions, with unobservables",
-        figure_path + "sl_bernoulli_independent_with_Z",
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=False
     )
 
-gc.collect()
-plt.close('all')
-
 if which == 3:
-    print("\nThreshold rule + independent decisions")
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: random and y ~ x + z")
 
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+    decider = lambda x: randomDecider(x, nJudges_M=M_sim, use_acceptance_rates=True)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Threshold rule + independent decisions, with unobservables",
-        figure_path + "sl_threshold_independent_with_Z",
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=True
     )
 
-gc.collect()
-plt.close('all')
-
 if which == 4:
-    print("\nBernoulli + non-independent (batch) decisions")
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: y ~ x and random")
 
     dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Bernoulli + non-independent decisions, with unobservables",
-        figure_path + "sl_bernoulli_batch_with_Z",
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="fully_random", 
+        fit_with_Z=False
     )
 
-gc.collect()
-plt.close('all')
-
 if which == 5:
-    print("\nThreshold rule + non-independent (batch) decisions")
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: y ~ x and y ~ x")
 
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Threshold rule + non-independent decisions, with unobservables",
-        figure_path + "sl_threshold_batch_with_Z",
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=False
     )
-
-gc.collect()
-plt.close('all')
-
+    
 if which == 6:
-    print("\nRandom decider")
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: y ~ x and y ~ x + z")
 
     dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: randomDecider(
-        x, nJudges_M=M_sim, use_acceptance_rates=True)
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Bernoulli + random decider with leniency and unobservables",
-        figure_path + "sl_random_decider_with_Z",
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=True
     )
+    
+if which == 7:
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: y ~ x + z and random")
 
-gc.collect()
-plt.close('all')
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-if which == 7:
-    print("\nBiased decider")
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="fully_random", 
+        fit_with_Z=False
+    )
+
+if which == 8:
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: y ~ x + z and y ~ x")
 
     dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True)
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Bernoulli + biased decider with leniency and unobservables",
-        figure_path + "sl_biased_decider_with_Z",
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=False
     )
+    
+if which == 9:
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: y ~ x + z and y ~ x + z")
 
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-if which == 8:
-    print("\nBad judge")
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=True
+    )
+    
+if which == 10:
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: biased and random")
 
     dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim)
+    decider = lambda x: biasDecider(
+        x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Bernoulli + 'bad' decider with leniency and unobservables",
-        figure_path + "sl_bad_decider_with_Z"
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="fully_random", 
+        fit_with_Z=False
     )
 
-gc.collect()
-plt.close('all')
+if which == 11:
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: biased and y ~ x")
 
-if which == 9:
-    print("\nBernoulli + Bernoulli")
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
+
+    decider = lambda x: biasDecider(
+        x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1)
+
+    perfComp(
+        dg, lambda x: decider(x),
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=False
+    )
+
+if which == 12:
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Decision-maker in the data and model: biased and y ~ x + z")
 
     dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim)
+    decider = lambda x: biasDecider(
+        x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Bernoulli + Bernoulli",
-        figure_path + "sl_bernoulli_bernoulli_with_Z",
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=True
     )
     
-if which == 10:
-    print("\nBeta_Z = 3, Threshold + batch")
+if which == 13:
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Fewer subjects per decision-maker, from 500 to 100. Now N_total =", N_sim / 5)
 
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0)
+    dg = lambda: bernoulliDGWithUnobservables(N_total=int(N_sim / 5))
 
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True)
+    decider = lambda x: quantileDecider(
+        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Beta_Z = 3, threshold + batch",
-        figure_path + "sl_threshold_batch_beta_Z_3_with_Z",
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=True
     )
     
-if which == 11:
-    print("\nBeta_Z = 5, Threshold + batch")
+if which == 14:
+    print("\nWith unobservables (Bernoullian outcome + independent decisions)")
+    
+    print("Now R ~ Uniform(0.1, 0.4)")
 
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0)
+    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
 
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True)
+    decider = lambda x: quantileDecider(x, featureX_col="X", featureZ_col=None, 
+                                        nJudges_M=M_sim, beta_X=1, beta_Z=1,
+                                        leniency_upper_limit=0.4)
 
     perfComp(
         dg, lambda x: decider(x),
-        "Beta_Z = 5, threshold + batch",
-        figure_path + "sl_threshold_batch_beta_Z_5_with_Z",
-    )
\ No newline at end of file
+        "Fluctuation of failure rate estimates across iterations\n" +
+        "Bernoulli + independent decisions, without unobservables",
+        figure_path + "sl_bernoulli_independent_without_Z",
+        model_type="lr", 
+        fit_with_Z=True
+    )
diff --git a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py b/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py
deleted file mode 100644
index 4ea26d0..0000000
--- a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py
+++ /dev/null
@@ -1,1101 +0,0 @@
-'''
-# Author: Riku Laine
-# Date: 25JUL2019 (start)
-# Project name: Potential outcomes in model evaluation
-# Description: This script creates the figures and results used 
-#              in synthetic data experiments.
-#
-# Parameters:
-# -----------
-# (1) figure_path : file name for saving the created figures.
-# (2) N_sim : Size of simulated data set.
-# (3) M_sim : Number of judges in simulated data, 
-#             N_sim must be divisible by M_sim!
-# (4) which : Which data + outcome analysis should be performed.
-# (5) group_amount : How many groups if Jung-inspired model is used.
-# (6) stan_code_file_name : Name of file containing the stan model code.
-# (7) sigma_tau : Values of prior variance for the Jung-inspired model.
-# (8) model_type : What model type to be fitted. Options:
-#                   - "lr" : logistic regression
-#                   - "rf" : random forest
-#                   - "fully_random" : Fully random, all predictions will be 0.5.
-#
-'''
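# A hypothetical invocation sketch matching the eight positional parameters
# documented above (all values and the Stan file name below are illustrative
# placeholders, not taken from the repository):
#
#   python stan_modelling_theoretic_SEfixed_useX.py figures/sl_ 50000 100 1 10 model_code.stan 1 lr
#
# i.e. figure_path="figures/sl_", N_sim=50000, M_sim=100 (N_sim divisible by
# M_sim), which=1, group_amount=10, stan_code_file_name="model_code.stan",
# sigma_tau=1 and model_type="lr".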
-# Refer to the `notes.tex` file for explanations about the modular framework.
-
-# Imports
-
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import scipy.stats as scs
-import scipy.special as ssp
-import scipy.integrate as si
-import numpy.random as npr
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
-import pystan
-import gc
-
-plt.switch_backend('agg')
-
-import sys
-
-# figure storage name
-figure_path = sys.argv[1]
-
-# Size of simulated data set
-N_sim = int(sys.argv[2])
-
-# Number of judges in simulated data, N_sim must be divisible by M_sim!
-M_sim = int(sys.argv[3])
-
-# Which data + outcome generation should be performed.
-which = int(sys.argv[4])
-
-# How many groups if jung model is used
-group_amount = int(sys.argv[5])
-
-# Name of stan model code file
-stan_code_file_name = sys.argv[6]
-
-# Variance prior
-sigma_tau = float(sys.argv[7])
-
-# Type of model to be fitted
-model_type = sys.argv[8]
-
-# Settings
-
-plt.rcParams.update({'font.size': 16})
-plt.rcParams.update({'figure.figsize': (10, 6)})
-
-print("These results have been obtained with the following settings:")
-
-print("Number of observations in the simulated data:", N_sim)
-
-print("Number of judges in the simulated data:", M_sim)
-
-print("Number of groups:", group_amount)
-
-print("Prior for the variances:", sigma_tau)
-
-# Basic functions
-
-
-def inverseLogit(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
-
-
-def logit(x):
-    return np.log(x) - np.log(1.0 - x)
-
-
-def inverseCumulative(x, mu, sigma):
-    '''Compute the inverse of the cumulative distribution of logit-normal
-    distribution at x with parameters mu and sigma (mean and st.dev.).'''
-
-    return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
-
-def standardError(p, n):
-    denominator = p * (1 - p)
-    
-    return np.sqrt(denominator / n)
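# A quick sanity-check sketch for inverseCumulative, assuming mu = 0 as it is
# used throughout this script: the r-quantile of inverseLogit(X) with
# X ~ N(0, sigma^2) should agree with the closed form up to Monte Carlo error.
sigma_check, r_check = 2.0, 0.3
samples_check = inverseLogit(npr.normal(scale=sigma_check, size=200000))
print(np.quantile(samples_check, r_check))                  # empirical quantile
print(inverseCumulative(r_check, mu=0, sigma=sigma_check))  # closed form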
-
-
-# ## Data generation modules
-
-def bernoulliDGWithoutUnobservables(N_total=50000):
-    '''Generates data | Variables: X, Y | Outcome from Bernoulli'''
-
-    df = pd.DataFrame()
-
-    # Sample feature X from standard Gaussian distribution, N(0, 1).
-    df = df.assign(X=npr.normal(size=N_total))
-
-    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
-    df = df.assign(probabilities_Y=inverseLogit(df.X))
-
-    # Draw Y ~ Bernoulli(1 - inverseLogit(X))
-    # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X)
-    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
-
-    df = df.assign(result_Y=results)
-
-    return df
-
-
-def thresholdDGWithUnobservables(N_total=50000,
-                                 beta_X=1.0,
-                                 beta_Z=1.0,
-                                 beta_W=0.2):
-    '''Generates data | Variables: X, Z, W, Y | Outcome by threshold'''
-
-    df = pd.DataFrame()
-
-    # Sample the variables from standard Gaussian distributions.
-    df = df.assign(X=npr.normal(size=N_total))
-    df = df.assign(Z=npr.normal(size=N_total))
-    df = df.assign(W=npr.normal(size=N_total))
-
-    # Calculate P(Y=0|X, Z, W)
-    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
-
-    df = df.assign(probabilities_Y=probabilities_Y)
-
-    # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise
-    df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1))
-
-    return df
-
-
-def bernoulliDGWithUnobservables(N_total=50000,
-                                beta_X=1.0,
-                                beta_Z=1.0,
-                                beta_W=0.2):
-    '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli'''
-    
-    df = pd.DataFrame()
-
-    # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1).
-    df = df.assign(X=npr.normal(size=N_total))
-    df = df.assign(Z=npr.normal(size=N_total))
-    df = df.assign(W=npr.normal(size=N_total))
-
-    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
-    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
-
-    df = df.assign(probabilities_Y=probabilities_Y)
-
-    # Draw Y from Bernoulli distribution
-    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
-
-    df = df.assign(result_Y=results)
-
-    return df
-
-
-# ## Decider modules
-
-def humanDeciderLakkaraju(df,
-                          featureX_col,
-                          featureZ_col=None,
-                          nJudges_M=100,
-                          beta_X=1,
-                          beta_Z=1,
-                          add_epsilon=True):
-    '''Decider module | Non-independent batch decisions.'''
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    # Sample acceptance rates uniformly from a closed interval
-    # from 0.1 to 0.9 and round to tenth decimal place.
-    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
-    #            values.
-    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
-    acceptance_rates = np.round(acceptance_rates, 10)
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    if add_epsilon:
-        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
-    else:
-        epsilon = 0
-    
-    if featureZ_col is None:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
-    else:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
-                                    beta_Z * df[featureZ_col] + epsilon)
-
-
-    df = df.assign(probabilities_T=probabilities_T)
-
-    # Sort by judges then probabilities in decreasing order
-    # Most dangerous for each judge are at the top.
-    df.sort_values(by=["judgeID_J", "probabilities_T"],
-                   ascending=False,
-                   inplace=True)
-
-    # Iterate over the data. Subject will be given a negative decision
-    # if they are in the top (1-r)*100% of the individuals the judge will judge.
-    # I.e. if their within-judge-index is under 1 - acceptance threshold times
-    # the number of subjects assigned to each judge they will receive a
-    # negative decision.
-    df.reset_index(drop=True, inplace=True)
-
-    df['decision_T'] = np.where((df.index.values % nSubjects_N) <
-                                ((1 - df['acceptanceRate_R']) * nSubjects_N),
-                                0, 1)
-
-    df_labeled = df.copy()
-
-    # Hide unobserved
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def bernoulliDecider(df,
-                    featureX_col,
-                    featureZ_col=None,
-                    nJudges_M=100,
-                    beta_X=1,
-                    beta_Z=1,
-                    add_epsilon=True):
-    '''Use X and Z to make a decision with probability 
-    P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).'''
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    if add_epsilon:
-        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
-    else:
-        epsilon = 0
-    
-    if featureZ_col is None:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
-    else:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
-                                    beta_Z * df[featureZ_col] + epsilon)
-
-    df = df.assign(probabilities_T=probabilities_T)
-
-    # Draw T from Bernoulli distribution
-    decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0])
-
-    df = df.assign(decision_T=decisions)
-
-    # Calculate the acceptance rates.
-    acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def quantileDecider(df,
-                    featureX_col,
-                    featureZ_col=None,
-                    nJudges_M=100,
-                    beta_X=1,
-                    beta_Z=1,
-                    add_epsilon=True):
-    '''Assign decisions by the value of inverse cumulative distribution function
-    of the logit-normal distribution at leniency r.'''
-    
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    # Sample acceptance rates uniformly from a closed interval
-    # from 0.1 to 0.9 and round to tenth decimal place.
-    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
-    #            values.
-    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
-    acceptance_rates = np.round(acceptance_rates, 10)
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    if add_epsilon:
-        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
-    else:
-        epsilon = 0
-    
-    if featureZ_col is None:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
-
-        # Compute the bounds straight from the inverse cumulative.
-        # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2.
-        df = df.assign(bounds=inverseCumulative(
-            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2)))
-    else:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
-                                    beta_Z * df[featureZ_col] + epsilon)
-
-        # Compute the bounds straight from the inverse cumulative.
-        # Assuming X and Z are i.i.d standard Gaussians with variance 1.
-        # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z).
-        df = df.assign(bounds=inverseCumulative(
-            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2)))
-
-    df = df.assign(probabilities_T=probabilities_T)
-
-    # Assign negative decision if the predicted probability (probabilities_T) is
-    # over the judge's threshold (bounds).
-    df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
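# A minimal usage sketch for the decider above (assumption: the frame comes
# from one of the data generation modules defined earlier and its size is
# divisible by nJudges_M):
example_df = bernoulliDGWithUnobservables(N_total=5000)
labeled_df, full_df = quantileDecider(example_df, featureX_col='X',
                                      featureZ_col='Z', nJudges_M=50)
# labeled_df hides result_Y wherever decision_T == 0; full_df keeps every outcome.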
-
-
-def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
-    '''Doesn't use any information about X and Z to make decisions.
-    
-    If use_acceptance_rates is False (default) then all decisions are positive
-    with probability 0.5. If True, probabilities will be sampled from 
-    U(0.1, 0.9) and rounded to tenth decimal place.'''
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    if use_acceptance_rates:
-        # Sample acceptance rates uniformly from a closed interval
-        # from 0.1 to 0.9 and round to tenth decimal place.
-        acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10)
-    else:
-        # No real leniency here -> set to 0.5.
-        acceptance_rates = np.ones(nJudges_M) * 0.5
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    df = df.assign(
-        decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0]))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def biasDecider(df,
-                featureX_col,
-                featureZ_col=None,
-                nJudges_M=100,
-                beta_X=1,
-                beta_Z=1,
-                add_epsilon=True):
-    '''
-    Biased decider: If X > 1, then X <- X * 0.75. People with high X, 
-    get more positive decisions as they should. And if -2 < X -1, then 
-    X <- X + 0.5. People with X in [-2, 1], get less positive decisions 
-    as they should.
-    
-    '''
-
-    # If X > 1, then X <- X * 0.75. People with high X, get more positive
-    # decisions as they should
-    df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] *
-                                     0.75, df[featureX_col]))
-
-    # If -2 < X -1, then X <- X + 0.5. People with X in [-2, 1], get less
-    # positive decisions as they should
-    df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1) == 1,
-                           df.biased_X + 0.5, df.biased_X)
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"
-
-    # Use quantile decider, but judge by the biased X.
-    df_labeled, df = humanDeciderLakkaraju(df,
-                                     featureX_col='biased_X',
-                                     featureZ_col=featureZ_col,
-                                     nJudges_M=nJudges_M,
-                                     beta_X=beta_X,
-                                     beta_Z=beta_Z,
-                                     add_epsilon=add_epsilon)
-
-    return df_labeled, df
-
-
-# ## Evaluator modules
-
-# ### Convenience functions
-
-def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
-    '''
-    Fit a predictive model (default logistic regression) with given training 
-    instances and return probabilities for test instances to obtain a given 
-    class label.
-    
-    Arguments:
-    ----------
-    
-    x_train -- x values of training instances
-    y_train -- y values of training instances
-    x_test -- x values of test instances
-    class_value -- class label for which the probabilities are counted for.
-    model_type -- type of model to be fitted.
-    
-    Returns:
-    --------
-    (1) Trained predictive model
-    (2) Probabilities for given test inputs for given class.
-    '''
-
-    if model_type is None or model_type in ["logistic_regression", "lr"]:
-        # Instantiate the model (using the default parameters)
-        logreg = LogisticRegression(solver='lbfgs')
-
-        # Check shape and fit the model.
-        if x_train.ndim == 1:
-            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
-        else:
-            logreg = logreg.fit(x_train, y_train)
-
-        label_probs_logreg = getProbabilityForClass(x_test, logreg,
-                                                    class_value)
-
-        return logreg, label_probs_logreg
-
-    elif model_type in ["random_forest", "rf"]:
-        # Instantiate the model
-        forest = RandomForestClassifier(n_estimators=100, max_depth=3)
-
-        # Check shape and fit the model.
-        if x_train.ndim == 1:
-            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
-        else:
-            forest = forest.fit(x_train, y_train)
-
-        label_probs_forest = getProbabilityForClass(x_test, forest,
-                                                    class_value)
-
-        return forest, label_probs_forest
-
-    elif model_type == "fully_random":
-
-        label_probs = np.ones_like(x_test) / 2
-
-        model_object = lambda x: 0.5
-
-        return model_object, label_probs
-    else:
-        raise ValueError("Invalid model_type!", model_type)
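# A small end-to-end sketch of the wrapper above (illustrative sizes only;
# reuses the data generation and decider modules defined earlier):
sketch_labeled, _ = quantileDecider(bernoulliDGWithUnobservables(N_total=5000),
                                    featureX_col='X', featureZ_col='Z', nJudges_M=50)
sketch_train, sketch_test = train_test_split(sketch_labeled, test_size=0.5)
_, sketch_probs = fitPredictiveModel(
    sketch_train.loc[sketch_train.decision_T == 1, 'X'],
    sketch_train.loc[sketch_train.decision_T == 1, 'result_Y'],
    sketch_test['X'], 0, model_type='lr')
# sketch_probs[i] is the fitted P(Y = 0 | X) for the i-th test row.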
-
-
-def getProbabilityForClass(x, model, class_value):
-    '''
-    Function (wrapper) for obtaining the probability of a class given x and a 
-    predictive model.
-
-    Arguments:
-    -----------
-    x -- individual features, an array of shape (observations, features)
-    model -- a trained sklearn model. Predicts probabilities for given x. 
-        Should accept input of shape (observations, features)
-    class_value -- the resulting class to predict (usually 0 or 1).
-
-    Returns:
-    --------
-    (1) The probabilities of given class label for each x.
-    '''
-    if x.ndim == 1:
-        # if x is vector, transform to column matrix.
-        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
-    else:
-        f_values = model.predict_proba(x)
-
-    # Get correct column of predicted class, remove extra dimensions and return.
-    return f_values[:, model.classes_ == class_value].flatten()
-
-
-def cdf(x_0, model, class_value):
-    '''
-    Cumulative distribution function as described above. Integral is 
-    approximated using Simpson's rule for efficiency.
-    
-    Arguments:
-    ----------
-    
-    x_0 -- private features of an instance for which the value of cdf is to be
-        calculated.
-    model -- a trained sklearn model. Predicts probabilities for given x. 
-        Should accept input of shape (observations, features)
-    class_value -- the resulting class to predict (usually 0 or 1).
-
-    '''
-
-    def prediction(x):
-        return getProbabilityForClass(
-            np.array([x]).reshape(-1, 1), model, class_value)
-
-    prediction_x_0 = prediction(x_0)
-
-    x_values = np.linspace(-15, 15, 40000)
-
-    x_preds = prediction(x_values)
-
-    y_values = scs.norm.pdf(x_values)
-
-    results = np.zeros(x_0.shape[0])
-
-    for i in range(x_0.shape[0]):
-
-        y_copy = y_values.copy()
-
-        y_copy[x_preds > prediction_x_0[i]] = 0
-
-        results[i] = si.simps(y_copy, x=x_values)
-
-    return results
-
-# ### Contraction algorithm
-# 
-# Below is an implementation of Lakkaraju's team's algorithm presented in 
-# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant
-# parameters to be passed to the function are presented in the description.
-
-def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
-                accRateR_col, r):
-    '''
-    This is an implementation of the algorithm presented by Lakkaraju
-    et al. in their paper "The Selective Labels Problem: Evaluating 
-    Algorithmic Predictions in the Presence of Unobservables" (2017).
-
-    Arguments:
-    ----------
-    df -- The (Pandas) data frame containing the data, judge decisions,
-        judge IDs, results and probability scores.
-    judgeIDJ_col -- String, the name of the column containing the judges' IDs
-        in df.
-    decisionT_col -- String, the name of the column containing the judges' decisions
-    resultY_col -- String, the name of the column containing the realization
-    modelProbS_col -- String, the name of the column containing the probability
-        scores from the black-box model B.
-    accRateR_col -- String, the name of the column containing the judges' 
-        acceptance rates
-    r -- Float between 0 and 1, the given acceptance rate.
-
-    Returns:
-    --------
-    (1) The estimated failure rate at acceptance rate r.
-    '''
-    # Get ID of the most lenient judge.
-    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
-
-    # Subset. "D_q is the set of all observations judged by q."
-    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
-
-    # All observations of R_q have observed outcome labels.
-    # "R_q is the set of observations in D_q with observed outcome labels."
-    R_q = D_q[D_q[decisionT_col] == 1].copy()
-
-    # Sort observations in R_q in descending order of confidence scores S and
-    # assign to R_sort_q.
-    # "Observations deemed as high risk by B are at the top of this list"
-    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
-
-    number_to_remove = int(
-        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
-
-    # "R_B is the list of observations assigned to t = 1 by B"
-    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
-
-    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
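# A usage sketch for contraction (assumption: a labeled test frame with the
# black-box scores already attached in 'B_prob_0_model', exactly as perfComp
# builds below):
fr_hat, n_q = contraction(test_labeled, 'judgeID_J', 'decision_T', 'result_Y',
                          'B_prob_0_model', 'acceptanceRate_R', r=0.5)
# fr_hat is the estimated failure rate at acceptance rate 0.5 and n_q the
# number of cases handled by the most lenient judge.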
-
-
-# ### Evaluators
-
-def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r):
-
-    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(df.shape[0] * r))
-    
-    failed = df[resultY_col][0:to_release] == 0
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-def labeledOutcomesEvaluator(df,
-                             featureX_col,
-                             decisionT_col,
-                             resultY_col,
-                             r,
-                             adjusted=False):
-
-    df_observed = df.loc[df[decisionT_col] == 1, :]
-
-    df_observed = df_observed.sort_values(by='B_prob_0_model',
-                                              inplace=False,
-                                              ascending=True)
-
-    to_release = int(round(df_observed.shape[0] * r))
-    
-    failed = df_observed[resultY_col][0:to_release] == 0
-
-    if adjusted:
-        return np.mean(failed), df.shape[0]
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
-                             accRateR_col, r):
-
-    # Get judges with correct leniency as list
-    is_correct_leniency = df[accRateR_col].round(1) == r
-
-    # No judges with correct leniency
-    if np.sum(is_correct_leniency) == 0:
-        return np.nan, np.nan
-
-    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
-
-    # Released are the people they judged and released, T = 1
-    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
-                  & (df[decisionT_col] == 1)]
-
-    failed = released[resultY_col] == 0
-    
-    # Get their failure rate, aka ratio of reoffenders to number of people judged in total
-    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
-
-
-def monteCarloEvaluator(df,
-                        featureX_col,
-                        decisionT_col,
-                        resultY_col,
-                        accRateR_col,
-                        r,
-                        mu_X=0,
-                        mu_Z=0,
-                        beta_X=1,
-                        beta_Z=1,
-                        sigma_X=1,
-                        sigma_Z=1):
-
-    # Compute the predicted/assumed decision bounds for all the judges.
-    q_r = inverseCumulative(x=df[accRateR_col],
-                             mu=mu_X + mu_Z,
-                             sigma=np.sqrt((beta_X * sigma_X)**2 +
-                                           (beta_Z * sigma_Z)**2))
-
-    df = df.assign(bounds=logit(q_r) - df[featureX_col])
-
-    # Compute the expectation of Z when it is known to come from truncated
-    # Gaussian.
-    alphabeta = (df.bounds - mu_Z) / (sigma_Z)
-
-    Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z)  # 1 - cdf(ab)
-
-    # E(Z | Z > a). Expectation of Z if negative decision.
-    exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_
-
-    # E(Z | Z < b). Expectation of Z if positive decision.
-    exp_upper_trunc = mu_Z - (
-        sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta)
-
-    exp_Z = (1 - df[decisionT_col]
-             ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc
-
-    # Attach the predicted probability for Y=0 to data.
-    df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z))
-
-    # Predictions drawn from binomial.
-    predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0])
-
-    df[resultY_col] = np.where(df[decisionT_col] == 0, predictions,
-                                 df[resultY_col])
-
-    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(df.shape[0] * r))
-    
-    failed = df[resultY_col][0:to_release] == 0
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
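# A small check of the truncated-Gaussian mean formulas the evaluator above
# relies on, assuming a standard normal Z and an illustrative cut point a = 0.5;
# scipy's truncnorm should agree with the closed forms:
a_check = 0.5
print(scs.truncnorm.mean(a_check, np.inf),  scs.norm.pdf(a_check) / scs.norm.sf(a_check))   # E(Z | Z > a)
print(scs.truncnorm.mean(-np.inf, a_check), -scs.norm.pdf(a_check) / scs.norm.cdf(a_check)) # E(Z | Z < a)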
-
-
-
-def perfComp(dgModule, deciderModule, title, save_name):
-    failure_rates = np.zeros((8, 7))
-    error_Ns = np.zeros((8, 7))
-
-    # Create data
-    df = dgModule()
-
-    # Decicions
-    df_labeled, df_unlabeled = deciderModule(df)
-
-    # Split data
-    train, test_labeled = train_test_split(df_labeled, test_size=0.5)
-    
-    # Assign same observations to unlabeled dat
-    test_unlabeled = df_unlabeled.iloc[test_labeled.index.values]
-
-    # Train model
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train['decision_T'] == 1, 'X'],
-        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], 0, model_type=model_type)
-
-    # Attach predictions to data
-    test_labeled = test_labeled.assign(B_prob_0_model=predictions)
-    test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
-
-    test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
-
-    # Find observed values
-    observed = test_labeled['decision_T'] == 1
-
-    # Assign data to the model
-    dat = dict(D=1,
-               N_obs=np.sum(observed),
-               N_cens=np.sum(~observed),
-               K=group_amount,
-               sigma_tau=sigma_tau,
-               M=len(set(df_unlabeled['judgeID_J'])),
-               jj_obs=test_labeled.loc[observed, 'judgeID_J']+1,
-               jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1,
-               kk_obs=kk_array[observed]+1,
-               kk_cens=kk_array[~observed]+1,
-               dec_obs=test_labeled.loc[observed, 'decision_T'],
-               dec_cens=test_labeled.loc[~observed, 'decision_T'],
-               X_obs=test_labeled.loc[observed, 'X'].values.reshape(-1,1),
-               X_cens=test_labeled.loc[~observed, 'X'].values.reshape(-1,1),
-               y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
-
-    fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9))
-
-    pars = fit.extract()
-
-    plt.figure(figsize=(15,30))
-
-    fit.plot();
-
-    plt.savefig(save_name + '_stan_diagnostic_plot')
-    
-    plt.show()
-    plt.close('all')
-
-    print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
-
-    # Bayes
-    
-    # Initialize a matrix: one posterior draw per row, one observation per column.
-    y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
-    
-    # Fill the unobserved outcomes with the estimated ones.
-    y_imp[:, ~observed] = 1 - pars['y_est']
-    
-    # Fill the observed outcomes with the observed values.
-    y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
-
-    Rs = np.arange(.1, .9, .1)
-    
-    to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
-        
-    for i in range(len(to_release_list)):
-        
-        failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
-        
-        est_failure_rates =  failed / test_labeled.shape[0]
-                
-        failure_rates[i, 6] = np.mean(est_failure_rates)
-        
-        error_Ns[i, 6] = test_labeled.shape[0]
-                    
-    for r in range(1, 9):
-
-        print(".", end="")
-
-        # True evaluation
-
-        FR, N = trueEvaluationEvaluator(test_unlabeled, 'X', 'decision_T', 
-                                        'result_Y', r / 10)
-        
-        failure_rates[r - 1, 0] = FR
-        error_Ns[r - 1, 0] = N
-
-        # Labeled outcomes only
-
-        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T', 
-                                         'result_Y', r / 10)
-
-        failure_rates[r - 1, 1] = FR
-        error_Ns[r - 1, 1] = N
-
-        # Adjusted labeled outcomes
-
-        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T',
-                                         'result_Y', r / 10, adjusted=True)
-
-        failure_rates[r - 1, 2] = FR
-        error_Ns[r - 1, 2] = N
-
-        # Human evaluation
-
-        FR, N = humanEvaluationEvaluator(test_labeled, 'judgeID_J', 
-                                         'decision_T', 'result_Y', 
-                                         'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 3] = FR
-        error_Ns[r - 1, 3] = N
-
-        # Contraction
-
-        FR, N = contraction(test_labeled, 'judgeID_J', 'decision_T',  
-                            'result_Y', 'B_prob_0_model', 'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 4] = FR
-        error_Ns[r - 1, 4] = N
-
-        # Causal model - analytic solution
-
-        FR, N = monteCarloEvaluator(test_labeled, 'X', 'decision_T', 
-                                    'result_Y', 'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 5] = FR
-        error_Ns[r - 1, 5] = N
-        
-    failure_SEs = standardError(failure_rates, error_Ns)
-
-    x_ax = np.arange(0.1, 0.9, 0.1)
-
-    labels = [
-        'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
-        'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes'
-    ]
-    colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c']
-
-    for i in range(failure_rates.shape[1]):
-        plt.errorbar(x_ax,
-                     failure_rates[:, i],
-                     label=labels[i],
-                     c=colours[i],
-                     yerr=failure_SEs[:, i])
-
-    plt.title('Failure rate vs. Acceptance rate')
-    plt.xlabel('Acceptance rate')
-    plt.ylabel('Failure rate')
-    plt.legend()
-    plt.grid()
-    
-    plt.savefig(save_name + '_all')
-    
-    plt.show()
-
-    print("\nFailure rates:")
-    print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
-    
-    print("\nMean absolute errors:")
-    for i in range(1, failure_rates.shape[1]):
-        print(
-            labels[i].ljust(len(max(labels, key=len))),
-            np.round(
-                np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
-
-
-sm = pystan.StanModel(file=stan_code_file_name)
-
-if which == 1:
-    print("\nWithout unobservables (Bernoulli + independent decisions)")
-
-    dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim)
-
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations\n" +
-        "Bernoulli + independent decisions, without unobservables",
-        figure_path + "sl_bernoulli_independent_without_Z"
-    )
-
-gc.collect()
-plt.close('all')
-
-print("\nWith unobservables in the data")
-
-if which == 2:
-    print("\nBernoulli + independent decisions")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Bernoulli + independent decisions, with unobservables",
-        figure_path + "sl_bernoulli_independent_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 3:
-    print("\nThreshold rule + independent decisions")
-
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Threshold rule + independent decisions, with unobservables",
-        figure_path + "sl_threshold_independent_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 4:
-    print("\nBernoulli + non-independent (batch) decisions")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Bernoulli + non-independent decisions, with unobservables",
-        figure_path + "sl_bernoulli_batch_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 5:
-    print("\nThreshold rule + non-independent (batch) decisions")
-
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Threshold rule + non-independent decisions, with unobservables",
-        figure_path + "sl_threshold_batch_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 6:
-    print("\nRandom decider")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: randomDecider(
-        x, nJudges_M=M_sim, use_acceptance_rates=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Bernoulli + random decider with leniency and unobservables",
-        figure_path + "sl_random_decider_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 7:
-    print("\nBiased decider")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Bernoulli + biased decider with leniency and unobservables",
-        figure_path + "sl_biased_decider_with_Z",
-    )
-
-
-if which == 8:
-    print("\nBad judge")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Bernoulli + 'bad' decider with leniency and unobservables",
-        figure_path + "sl_bad_decider_with_Z"
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 9:
-    print("\nBernoulli + Bernoulli")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Bernoulli + Bernoulli",
-        figure_path + "sl_bernoulli_bernoulli_with_Z",
-    )
-    
-if which == 10:
-    print("\nBeta_Z = 3, Threshold + batch")
-
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0)
-
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Beta_Z = 3, threshold + batch",
-        figure_path + "sl_threshold_batch_beta_Z_3_with_Z",
-    )
-    
-if which == 11:
-    print("\nBeta_Z = 5, Threshold + batch")
-
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0)
-
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Beta_Z = 5, threshold + batch",
-        figure_path + "sl_threshold_batch_beta_Z_5_with_Z",
-    )
\ No newline at end of file
diff --git a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py b/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py
deleted file mode 100644
index 00711de..0000000
--- a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py
+++ /dev/null
@@ -1,1101 +0,0 @@
-'''
-# Author: Riku Laine
-# Date: 25JUL2019 (start)
-# Project name: Potential outcomes in model evaluation
-# Description: This script creates the figures and results used 
-#              in synthetic data experiments.
-#
-# Parameters:
-# -----------
-# (1) figure_path : file name for saving the created figures.
-# (2) N_sim : Size of simulated data set.
-# (3) M_sim : Number of judges in simulated data, 
-#             N_sim must be divisible by M_sim!
-# (4) which : Which data + outcome analysis should be performed.
-# (5) group_amount : How many groups if Jung-inspired model is used.
-# (6) stan_code_file_name : Name of file containing the stan model code.
-# (7) sigma_tau : Values of prior variance for the Jung-inspired model.
-# (8) model_type : What model type to be fitted. Options:
-#                   - "lr" : logistic regression
-#                   - "rf" : random forest
-#                   - "fully_random" : Fully random, all predictions will be 0.5.
-#
-'''
-# Refer to the `notes.tex` file for explanations about the modular framework.
-
-# Imports
-
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-import scipy.stats as scs
-import scipy.special as ssp
-import scipy.integrate as si
-import numpy.random as npr
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
-import pystan
-import gc
-
-plt.switch_backend('agg')
-
-import sys
-
-# figure storage name
-figure_path = sys.argv[1]
-
-# Size of simulated data set
-N_sim = int(sys.argv[2])
-
-# Number of judges in simulated data, N_sim must be divisible by M_sim!
-M_sim = int(sys.argv[3])
-
-# Which data + outcome generation should be performed.
-which = int(sys.argv[4])
-
-# How many groups if jung model is used
-group_amount = int(sys.argv[5])
-
-# Name of stan model code file
-stan_code_file_name = sys.argv[6]
-
-# Variance prior
-sigma_tau = float(sys.argv[7])
-
-# Type of model to be fitted
-model_type = sys.argv[8]
-
-# Settings
-
-plt.rcParams.update({'font.size': 16})
-plt.rcParams.update({'figure.figsize': (10, 6)})
-
-print("These results have been obtained with the following settings:")
-
-print("Number of observations in the simulated data:", N_sim)
-
-print("Number of judges in the simulated data:", M_sim)
-
-print("Number of groups:", group_amount)
-
-print("Prior for the variances:", sigma_tau)
-
-# Basic functions
-
-
-def inverseLogit(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
-
-
-def logit(x):
-    return np.log(x) - np.log(1.0 - x)
-
-
-def inverseCumulative(x, mu, sigma):
-    '''Compute the inverse of the cumulative distribution of logit-normal
-    distribution at x with parameters mu and sigma (mean and st.dev.).'''
-
-    return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)
-
-def standardError(p, n):
-    denominator = p * (1 - p)
-    
-    return np.sqrt(denominator / n)
-
-
-# ## Data generation modules
-
-def bernoulliDGWithoutUnobservables(N_total=50000):
-    '''Generates data | Variables: X, Y | Outcome from Bernoulli'''
-
-    df = pd.DataFrame()
-
-    # Sample feature X from standard Gaussian distribution, N(0, 1).
-    df = df.assign(X=npr.normal(size=N_total))
-
-    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
-    df = df.assign(probabilities_Y=inverseLogit(df.X))
-
-    # Draw Y ~ Bernoulli(1 - inverseLogit(X))
-    # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X)
-    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
-
-    df = df.assign(result_Y=results)
-
-    return df
-
-
-def thresholdDGWithUnobservables(N_total=50000,
-                                 beta_X=1.0,
-                                 beta_Z=1.0,
-                                 beta_W=0.2):
-    '''Generates data | Variables: X, Z, W, Y | Outcome by threshold'''
-
-    df = pd.DataFrame()
-
-    # Sample the variables from standard Gaussian distributions.
-    df = df.assign(X=npr.normal(size=N_total))
-    df = df.assign(Z=npr.normal(size=N_total))
-    df = df.assign(W=npr.normal(size=N_total))
-
-    # Calculate P(Y=0|X, Z, W)
-    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
-
-    df = df.assign(probabilities_Y=probabilities_Y)
-
-    # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise
-    df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1))
-
-    return df
-
-
-def bernoulliDGWithUnobservables(N_total=50000,
-                                beta_X=1.0,
-                                beta_Z=1.0,
-                                beta_W=0.2):
-    '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli'''
-    
-    df = pd.DataFrame()
-
-    # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1).
-    df = df.assign(X=npr.normal(size=N_total))
-    df = df.assign(Z=npr.normal(size=N_total))
-    df = df.assign(W=npr.normal(size=N_total))
-
-    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
-    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
-
-    df = df.assign(probabilities_Y=probabilities_Y)
-
-    # Draw Y from Bernoulli distribution
-    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
-
-    df = df.assign(result_Y=results)
-
-    return df
-
-
-# ## Decider modules
-
-def humanDeciderLakkaraju(df,
-                          featureX_col,
-                          featureZ_col=None,
-                          nJudges_M=100,
-                          beta_X=1,
-                          beta_Z=1,
-                          add_epsilon=True):
-    '''Decider module | Non-independent batch decisions.'''
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    # Sample acceptance rates uniformly from a closed interval
-    # from 0.1 to 0.9 and round to tenth decimal place.
-    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
-    #            values.
-    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
-    acceptance_rates = np.round(acceptance_rates, 10)
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    if add_epsilon:
-        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
-    else:
-        epsilon = 0
-    
-    if featureZ_col is None:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
-    else:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
-                                    beta_Z * df[featureZ_col] + epsilon)
-
-
-    df = df.assign(probabilities_T=probabilities_T)
-
-    # Sort by judges then probabilities in decreasing order
-    # Most dangerous for each judge are at the top.
-    df.sort_values(by=["judgeID_J", "probabilities_T"],
-                   ascending=False,
-                   inplace=True)
-
-    # Iterate over the data. Subject will be given a negative decision
-    # if they are in the top (1-r)*100% of the individuals the judge will judge.
-    # I.e. if their within-judge-index is under 1 - acceptance threshold times
-    # the number of subjects assigned to each judge they will receive a
-    # negative decision.
-    df.reset_index(drop=True, inplace=True)
-
-    df['decision_T'] = np.where((df.index.values % nSubjects_N) <
-                                ((1 - df['acceptanceRate_R']) * nSubjects_N),
-                                0, 1)
-
-    df_labeled = df.copy()
-
-    # Hide unobserved
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def bernoulliDecider(df,
-                    featureX_col,
-                    featureZ_col=None,
-                    nJudges_M=100,
-                    beta_X=1,
-                    beta_Z=1,
-                    add_epsilon=True):
-    '''Use X and Z to make a decision with probability 
-    P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).'''
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    if add_epsilon:
-        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
-    else:
-        epsilon = 0
-    
-    if featureZ_col is None:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
-    else:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
-                                    beta_Z * df[featureZ_col] + epsilon)
-
-    df = df.assign(probabilities_T=probabilities_T)
-
-    # Draw T from Bernoulli distribution
-    decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0])
-
-    df = df.assign(decision_T=decisions)
-
-    # Calculate the acceptance rates.
-    acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def quantileDecider(df,
-                    featureX_col,
-                    featureZ_col=None,
-                    nJudges_M=100,
-                    beta_X=1,
-                    beta_Z=1,
-                    add_epsilon=True):
-    '''Assign decisions by the value of inverse cumulative distribution function
-    of the logit-normal distribution at leniency r.'''
-    
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    # Sample acceptance rates uniformly from a closed interval
-    # from 0.1 to 0.9 and round to tenth decimal place.
-    # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
-    #            values.
-    acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
-    acceptance_rates = np.round(acceptance_rates, 10)
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    if add_epsilon:
-        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
-    else:
-        epsilon = 0
-    
-    if featureZ_col is None:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
-
-        # Compute the bounds straight from the inverse cumulative.
-        # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2.
-        df = df.assign(bounds=inverseCumulative(
-            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2)))
-    else:
-        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
-                                    beta_Z * df[featureZ_col] + epsilon)
-
-        # Compute the bounds straight from the inverse cumulative.
-        # Assuming X and Z are i.i.d standard Gaussians with variance 1.
-        # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z).
-        df = df.assign(bounds=inverseCumulative(
-            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2)))
-
-    df = df.assign(probabilities_T=probabilities_T)
-
-    # Assign negative decision if the predicted probability (probabilities_T) is
-    # over the judge's threshold (bounds).
-    df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
-    '''Makes decisions without using any information about X or Z.
-    
-    If use_acceptance_rates is False (default), every decision is positive
-    with probability 0.5. If True, judge-specific acceptance probabilities are
-    sampled from U(0.1, 0.9) and rounded to ten decimal places.'''
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    if use_acceptance_rates:
-        # Sample acceptance rates uniformly from a closed interval
-        # from 0.1 to 0.9 and round to tenth decimal place.
-        acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10)
-    else:
-        # No real leniency here -> set to 0.5.
-        acceptance_rates = np.ones(nJudges_M) * 0.5
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    df = df.assign(
-        decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0]))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def biasDecider(df,
-                featureX_col,
-                featureZ_col=None,
-                nJudges_M=100,
-                beta_X=1,
-                beta_Z=1,
-                add_epsilon=True):
-    '''
-    Biased decider: if X > 1, then X <- X * 0.75, so subjects with high X
-    receive more positive decisions than they should. If -2 < X < -1, then
-    X <- X + 0.5, so subjects with X in (-2, -1) receive fewer positive
-    decisions than they should.
-    '''
-
-    # If X > 1, then X <- X * 0.75: subjects with high X get more positive
-    # decisions than they should.
-    df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] *
-                                     0.75, df[featureX_col]))
-
-    # If -2 < X < -1, then X <- X + 0.5: subjects with X in (-2, -1) get
-    # fewer positive decisions than they should.
-    df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1),
-                           df.biased_X + 0.5, df.biased_X)
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
-
-    # Decide with the Lakkaraju-style human decider, but judge by the biased X.
-    df_labeled, df = humanDeciderLakkaraju(df,
-                                     featureX_col='biased_X',
-                                     featureZ_col=featureZ_col,
-                                     nJudges_M=nJudges_M,
-                                     beta_X=beta_X,
-                                     beta_Z=beta_Z,
-                                     add_epsilon=add_epsilon)
-
-    return df_labeled, df
-
-
-# ## Evaluator modules
-
-# ### Convenience functions
-
-def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
-    '''
-    Fit a predictive model (default logistic regression) with given training 
-    instances and return probabilities for test instances to obtain a given 
-    class label.
-    
-    Arguments:
-    ----------
-    
-    x_train -- x values of training instances
-    y_train -- y values of training instances
-    x_test -- x values of test instances
-    class_value -- the class label for which the probabilities are computed.
-    model_type -- type of model to be fitted.
-    
-    Returns:
-    --------
-    (1) Trained predictive model
-    (2) Probabilities for given test inputs for given class.
-    '''
-
-    if model_type is None or model_type in ["logistic_regression", "lr"]:
-        # Instantiate the model (using the default parameters)
-        logreg = LogisticRegression(solver='lbfgs')
-
-        # Check shape and fit the model.
-        if x_train.ndim == 1:
-            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
-        else:
-            logreg = logreg.fit(x_train, y_train)
-
-        label_probs_logreg = getProbabilityForClass(x_test, logreg,
-                                                    class_value)
-
-        return logreg, label_probs_logreg
-
-    elif model_type in ["random_forest", "rf"]:
-        # Instantiate the model
-        forest = RandomForestClassifier(n_estimators=100, max_depth=3)
-
-        # Check shape and fit the model.
-        if x_train.ndim == 1:
-            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
-        else:
-            forest = forest.fit(x_train, y_train)
-
-        label_probs_forest = getProbabilityForClass(x_test, forest,
-                                                    class_value)
-
-        return forest, label_probs_forest
-
-    elif model_type == "fully_random":
-
-        label_probs = np.ones_like(x_test) / 2
-
-        model_object = lambda x: 0.5
-
-        return model_object, label_probs
-    else:
-        raise ValueError("Invalid model_type!", model_type)
-
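-# A minimal usage sketch (illustrative only; `data` and its columns 'X' and
-# 'result_Y' are hypothetical placeholders):
-#
-#   model, probs_0 = fitPredictiveModel(data['X'], data['result_Y'],
-#                                       data['X'], class_value=0,
-#                                       model_type='lr')
-#
-# probs_0 then holds the predicted probability of class 0 for each test row.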
-
-def getProbabilityForClass(x, model, class_value):
-    '''
-    Function (wrapper) for obtaining the probability of a class given x and a 
-    predictive model.
-
-    Arguments:
-    -----------
-    x -- individual features, an array of shape (observations, features)
-    model -- a trained sklearn model. Predicts probabilities for given x. 
-        Should accept input of shape (observations, features)
-    class_value -- the resulting class to predict (usually 0 or 1).
-
-    Returns:
-    --------
-    (1) The probabilities of given class label for each x.
-    '''
-    if x.ndim == 1:
-        # if x is vector, transform to column matrix.
-        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
-    else:
-        f_values = model.predict_proba(x)
-
-    # Get correct column of predicted class, remove extra dimensions and return.
-    return f_values[:, model.classes_ == class_value].flatten()
-
-
-def cdf(x_0, model, class_value):
-    '''
-    Cumulative distribution function of the model's prediction, evaluated at
-    the prediction for x_0 and assuming standard normal features. The
-    integral is approximated using Simpson's rule for efficiency.
-    
-    Arguments:
-    ----------
-    
-    x_0 -- private features of an instance for which the value of cdf is to be
-        calculated.
-    model -- a trained sklearn model. Predicts probabilities for given x. 
-        Should accept input of shape (observations, features)
-    class_value -- the resulting class to predict (usually 0 or 1).
-
-    '''
-
-    def prediction(x):
-        return getProbabilityForClass(
-            np.array([x]).reshape(-1, 1), model, class_value)
-
-    prediction_x_0 = prediction(x_0)
-
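-    # Numerical approximation: evaluate the model's prediction on a dense grid,
-    # zero out the standard normal density wherever the prediction exceeds the
-    # prediction at x_0, and integrate what remains with Simpson's rule. This
-    # estimates P(f(X) <= f(x_0)) for X ~ N(0, 1).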
-    x_values = np.linspace(-15, 15, 40000)
-
-    x_preds = prediction(x_values)
-
-    y_values = scs.norm.pdf(x_values)
-
-    results = np.zeros(x_0.shape[0])
-
-    for i in range(x_0.shape[0]):
-
-        y_copy = y_values.copy()
-
-        y_copy[x_preds > prediction_x_0[i]] = 0
-
-        results[i] = si.simps(y_copy, x=x_values)
-
-    return results
-
-# ### Contraction algorithm
-# 
-# Below is an implementation of the algorithm presented by Lakkaraju et al. in
-# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). The
-# parameters to be passed to the function are described in its docstring.
-
-def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
-                accRateR_col, r):
-    '''
-    This is an implementation of the algorithm presented by Lakkaraju
-    et al. in their paper "The Selective Labels Problem: Evaluating 
-    Algorithmic Predictions in the Presence of Unobservables" (2017).
-
-    Arguments:
-    ----------
-    df -- The (Pandas) data frame containing the data, judge decisions,
-        judge IDs, results and probability scores.
-    judgeIDJ_col -- String, the name of the column containing the judges' IDs
-        in df.
-    decisionT_col -- String, the name of the column containing the judges' decisions.
-    resultY_col -- String, the name of the column containing the realized outcomes.
-    modelProbS_col -- String, the name of the column containing the probability
-        scores from the black-box model B.
-    accRateR_col -- String, the name of the column containing the judges'
-        acceptance rates.
-    r -- Float between 0 and 1, the given acceptance rate.
-
-    Returns:
-    --------
-    (1) The estimated failure rate at acceptance rate r.
-    (2) The number of observations judged by the most lenient judge (used for
-        computing standard errors).
-    '''
-    # Get ID of the most lenient judge.
-    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
-
-    # Subset. "D_q is the set of all observations judged by q."
-    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
-
-    # All observations of R_q have observed outcome labels.
-    # "R_q is the set of observations in D_q with observed outcome labels."
-    R_q = D_q[D_q[decisionT_col] == 1].copy()
-
-    # Sort observations in R_q in descending order of confidence scores S and
-    # assign to R_sort_q.
-    # "Observations deemed as high risk by B are at the top of this list"
-    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
-
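-    # To mimic acceptance rate r within D_q, only r * |D_q| observations should
-    # remain released. The judge already rejected |D_q| - |R_q| subjects, so
-    # remove the (1 - r) * |D_q| - (|D_q| - |R_q|) riskiest released
-    # observations from the top of the sorted list.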
-    number_to_remove = int(
-        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
-
-    # "R_B is the list of observations assigned to t = 1 by B"
-    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
-
-    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
-
-
-# ### Evaluators
-
-def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r):
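-    '''True evaluation: assumes all outcomes in df are observed (in perfComp
-    this evaluator receives the unlabeled data). Releases the round(r * N)
-    subjects with the lowest predicted probability of failure and returns the
-    number of failures (Y = 0) among them divided by N, together with N.'''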
-
-    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(df.shape[0] * r))
-    
-    failed = df[resultY_col][0:to_release] == 0
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-def labeledOutcomesEvaluator(df,
-                             featureX_col,
-                             decisionT_col,
-                             resultY_col,
-                             r,
-                             adjusted=False):
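-    '''Labeled outcomes evaluator: restricts attention to subjects with a
-    positive decision (T = 1), releases the r fraction of them with the lowest
-    predicted probability of failure and counts failures (Y = 0) among the
-    released. By default the count is divided by the size of the full data
-    frame; with adjusted=True the mean failure rate among the released is
-    returned instead.'''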
-
-    df_observed = df.loc[df[decisionT_col] == 1, :]
-
-    df_observed = df_observed.sort_values(by='B_prob_0_model',
-                                              inplace=False,
-                                              ascending=True)
-
-    to_release = int(round(df_observed.shape[0] * r))
-    
-    failed = df_observed[resultY_col][0:to_release] == 0
-
-    if adjusted:
-        return np.mean(failed), df.shape[0]
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
-                             accRateR_col, r):
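-    '''Human evaluation: selects the judges whose acceptance rate rounds to r,
-    takes the subjects they released (T = 1) and returns the number of failures
-    (Y = 0) among those subjects divided by the total number of subjects the
-    selected judges handled.'''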
-
-    # Get judges with correct leniency as list
-    is_correct_leniency = df[accRateR_col].round(1) == r
-
-    # No judges with correct leniency
-    if np.sum(is_correct_leniency) == 0:
-        return np.nan, np.nan
-
-    correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
-
-    # Released are the people they judged and released, T = 1
-    released = df[df[judgeIDJ_col].isin(correct_leniency_list)
-                  & (df[decisionT_col] == 1)]
-
-    failed = released[resultY_col] == 0
-    
-    # Failure rate: the number of failures (Y = 0) among the released divided
-    # by the total number of subjects these judges handled.
-    return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
-
-
-def monteCarloEvaluator(df,
-                        featureX_col,
-                        decisionT_col,
-                        resultY_col,
-                        accRateR_col,
-                        r,
-                        mu_X=0,
-                        mu_Z=0,
-                        beta_X=1,
-                        beta_Z=1,
-                        sigma_X=1,
-                        sigma_Z=1):
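-    '''Analytic (counterfactual) evaluator: recovers each judge's implied
-    decision threshold from the acceptance rate, computes the expectation of
-    the unobserved Z from the corresponding truncated Gaussian given the
-    decision, imputes Y for the rejected subjects by a Bernoulli draw using
-    the implied failure probability, and then evaluates as in
-    trueEvaluationEvaluator.'''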
-
-    # Compute the predicted/assumed decision bounds for all the judges.
-    q_r = inverseCumulative(x=df[accRateR_col],
-                             mu=mu_X + mu_Z,
-                             sigma=np.sqrt((beta_X * sigma_X)**2 +
-                                           (beta_Z * sigma_Z)**2))
-
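-    # Given the threshold logit(q_r) on the linear predictor and the observed
-    # X, the remaining threshold for the unobserved Z is logit(q_r) - X.
-    # (Note: the subtraction does not scale X by beta_X, so this matches the
-    # default beta_X = 1.)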
-    df = df.assign(bounds=logit(q_r) - df[featureX_col])
-
-    # Compute the expectation of Z when it is known to come from truncated
-    # Gaussian.
-    alphabeta = (df.bounds - mu_Z) / (sigma_Z)
-
-    Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z)  # 1 - cdf(ab)
-
-    # E(Z | Z > a). Expectation of Z if negative decision.
-    exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_
-
-    # E(Z | Z < b). Expectation of Z if positive decision.
-    exp_upper_trunc = mu_Z - (
-        sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta)
-
-    exp_Z = (1 - df[decisionT_col]
-             ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc
-
-    # Attach the predicted probability for Y=0 to data.
-    df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z))
-
-    # Predictions drawn from binomial.
-    predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0])
-
-    df[resultY_col] = np.where(df[decisionT_col] == 0, predictions,
-                                 df[resultY_col])
-
-    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(df.shape[0] * r))
-    
-    failed = df[resultY_col][0:to_release] == 0
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-
-def perfComp(dgModule, deciderModule, title, save_name):
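-    # Rows correspond to acceptance rates 0.1, ..., 0.8; columns correspond to
-    # the seven evaluators listed in `labels` below (true evaluation, labeled
-    # outcomes, adjusted labeled outcomes, human evaluation, contraction,
-    # analytic solution, potential outcomes).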
-    failure_rates = np.zeros((8, 7))
-    error_Ns = np.zeros((8, 7))
-
-    # Create data
-    df = dgModule()
-
-    # Decisions
-    df_labeled, df_unlabeled = deciderModule(df)
-
-    # Split data
-    train, test_labeled = train_test_split(df_labeled, test_size=0.5)
-    
-    # Assign the same observations to the unlabeled data.
-    test_unlabeled = df_unlabeled.iloc[test_labeled.index.values]
-
-    # Train model
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train['decision_T'] == 1, 'X'],
-        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], 0, model_type=model_type)
-
-    # Attach predictions to data
-    test_labeled = test_labeled.assign(B_prob_0_model=predictions)
-    test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
-
-    test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
-
-    # Find observed values
-    observed = test_labeled['decision_T'] == 1
-
-    # Assign data to the model
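-    # (Judge IDs and risk-group indices are shifted by +1 below because Stan
-    # uses 1-based indexing.)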
-    dat = dict(D=1,
-               N_obs=np.sum(observed),
-               N_cens=np.sum(~observed),
-               K=group_amount,
-               sigma_tau=sigma_tau,
-               M=len(set(df_unlabeled['judgeID_J'])),
-               jj_obs=test_labeled.loc[observed, 'judgeID_J']+1,
-               jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1,
-               kk_obs=kk_array[observed]+1,
-               kk_cens=kk_array[~observed]+1,
-               dec_obs=test_labeled.loc[observed, 'decision_T'],
-               dec_cens=test_labeled.loc[~observed, 'decision_T'],
-               X_obs=test_labeled.loc[observed, 'B_prob_0_model'].values.reshape(-1,1),
-               X_cens=test_labeled.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1),
-               y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
-
-    fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9))
-
-    pars = fit.extract()
-
-    plt.figure(figsize=(15,30))
-
-    fit.plot();
-
-    plt.savefig(save_name + '_stan_diagnostic_plot')
-    
-    plt.show()
-    plt.close('all')
-
-    print(fit,  file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
-
-    # Bayes
-    
-    # Initialize a matrix with one posterior sample per row and one
-    # observation per column. Coded so that 1 marks a failure (Y = 0).
-    y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
-    
-    # Fill the unobserved outcomes with the posterior estimates.
-    y_imp[:, ~observed] = 1 - pars['y_est']
-    
-    # Fill the observed outcomes with the observed values.
-    y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
-
-    Rs = np.arange(.1, .9, .1)
-    
-    to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
-        
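-    # test_labeled was sorted by B_prob_0_model above, so the first k columns
-    # of y_imp correspond to the k subjects with the lowest predicted risk.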
-    for i in range(len(to_release_list)):
-        
-        failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
-        
-        est_failure_rates =  failed / test_labeled.shape[0]
-                
-        failure_rates[i, 6] = np.mean(est_failure_rates)
-        
-        error_Ns[i, 6] = test_labeled.shape[0]
-                    
-    for r in range(1, 9):
-
-        print(".", end="")
-
-        # True evaluation
-
-        FR, N = trueEvaluationEvaluator(test_unlabeled, 'X', 'decision_T', 
-                                        'result_Y', r / 10)
-        
-        failure_rates[r - 1, 0] = FR
-        error_Ns[r - 1, 0] = N
-
-        # Labeled outcomes only
-
-        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T', 
-                                         'result_Y', r / 10)
-
-        failure_rates[r - 1, 1] = FR
-        error_Ns[r - 1, 1] = N
-
-        # Adjusted labeled outcomes
-
-        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T',
-                                         'result_Y', r / 10, adjusted=True)
-
-        failure_rates[r - 1, 2] = FR
-        error_Ns[r - 1, 2] = N
-
-        # Human evaluation
-
-        FR, N = humanEvaluationEvaluator(test_labeled, 'judgeID_J', 
-                                         'decision_T', 'result_Y', 
-                                         'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 3] = FR
-        error_Ns[r - 1, 3] = N
-
-        # Contraction
-
-        FR, N = contraction(test_labeled, 'judgeID_J', 'decision_T',  
-                            'result_Y', 'B_prob_0_model', 'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 4] = FR
-        error_Ns[r - 1, 4] = N
-
-        # Causal model - analytic solution
-
-        FR, N = monteCarloEvaluator(test_labeled, 'X', 'decision_T', 
-                                    'result_Y', 'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 5] = FR
-        error_Ns[r - 1, 5] = N
-        
-    failure_SEs = standardError(failure_rates, error_Ns)
-
-    x_ax = np.arange(0.1, 0.9, 0.1)
-
-    labels = [
-        'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
-        'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes'
-    ]
-    colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c']
-
-    for i in range(failure_rates.shape[1]):
-        plt.errorbar(x_ax,
-                     failure_rates[:, i],
-                     label=labels[i],
-                     c=colours[i],
-                     yerr=failure_SEs[:, i])
-
-    plt.title('Failure rate vs. Acceptance rate')
-    plt.xlabel('Acceptance rate')
-    plt.ylabel('Failure rate')
-    plt.legend()
-    plt.grid()
-    
-    plt.savefig(save_name + '_all')
-    
-    plt.show()
-
-    print("\nFailure rates:")
-    print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
-    
-    print("\nMean absolute errors:")
-    for i in range(1, failure_rates.shape[1]):
-        print(
-            labels[i].ljust(len(max(labels, key=len))),
-            np.round(
-                np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
-
-
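-# Compile the Stan model once; it is reused by all of the scenarios below.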
-sm = pystan.StanModel(file=stan_code_file_name)
-
-if which == 1:
-    print("\nWithout unobservables (Bernoulli + independent decisions)")
-
-    dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim)
-
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations\n" +
-        "Bernoulli + independent decisions, without unobservables",
-        figure_path + "sl_bernoulli_independent_without_Z"
-    )
-
-gc.collect()
-plt.close('all')
-
-print("\nWith unobservables in the data")
-
-if which == 2:
-    print("\nBernoulli + independent decisions")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Bernoulli + independent decisions, with unobservables",
-        figure_path + "sl_bernoulli_independent_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 3:
-    print("\nThreshold rule + independent decisions")
-
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: quantileDecider(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Threshold rule + independent decisions, with unobservables",
-        figure_path + "sl_threshold_independent_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 4:
-    print("\nBernoulli + non-independent (batch) decisions")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Bernoulli + non-independent decisions, with unobservables",
-        figure_path + "sl_bernoulli_batch_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 5:
-    print("\nThreshold rule + non-independent (batch) decisions")
-
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Fluctuation of failure rate estimates across iterations \n" +
-        "Threshold rule + non-independent decisions, with unobservables",
-        figure_path + "sl_threshold_batch_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 6:
-    print("\nRandom decider")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: randomDecider(
-        x, nJudges_M=M_sim, use_acceptance_rates=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Bernoulli + random decider with leniency and unobservables",
-        figure_path + "sl_random_decider_with_Z",
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 7:
-    print("\nBiased decider")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Bernoulli + biased decider with leniency and unobservables",
-        figure_path + "sl_biased_decider_with_Z",
-    )
-
-
-if which == 8:
-    print("\nBad judge")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Bernoulli + 'bad' decider with leniency and unobservables",
-        figure_path + "sl_bad_decider_with_Z"
-    )
-
-gc.collect()
-plt.close('all')
-
-if which == 9:
-    print("\nBernoulli + Bernoulli")
-
-    dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
-
-    decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Bernoulli + Bernoulli",
-        figure_path + "sl_bernoulli_bernoulli_with_Z",
-    )
-    
-if which == 10:
-    print("\nBeta_Z = 3, Threshold + batch")
-
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0)
-
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Beta_Z = 3, threshold + batch",
-        figure_path + "sl_threshold_batch_beta_Z_3_with_Z",
-    )
-    
-if which == 11:
-    print("\nBeta_Z = 5, Threshold + batch")
-
-    dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0)
-
-    decider = lambda x: humanDeciderLakkaraju(
-        x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True)
-
-    perfComp(
-        dg, lambda x: decider(x),
-        "Beta_Z = 5, threshold + batch",
-        figure_path + "sl_threshold_batch_beta_Z_5_with_Z",
-    )
\ No newline at end of file
-- 
GitLab