diff --git a/analysis_and_scripts/stan_modelling_empirical.py b/analysis_and_scripts/stan_modelling_empirical.py
index 5423a56ab0c6b9bfcc957292e95233753d677f71..2b62c3cd7d598619e458d5d135be8af0a3d2e3ee 100644
--- a/analysis_and_scripts/stan_modelling_empirical.py
+++ b/analysis_and_scripts/stan_modelling_empirical.py
@@ -1,19 +1,19 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-# Author: Riku Laine
-# Date: 12AUG2019
-# Project name: Potential outcomes in model evaluation
-# Description: This script creates the figures and results used 
-#              in empirical data experiments with COMPAS data.
-#
-# Parameters:
-# -----------
-# (1) save_name : file path and prefix for saving the created figures.
-# (2) group_amount : How many groups if the non-linear model is used.
-# (3) stan_code_file_name : Name of file containing the stan model code.
-# (4) sigma_tau : Values of prior variance for the Jung-inspired model.
-# (5) data_path : Path to data file.
+Author: Riku Laine
+Date: 12AUG2019 / Modified 30DEC2019
+Project name: Potential outcomes in model evaluation
+Description: This script creates the figures and results used
+             in empirical data experiments with COMPAS data.
+
+Parameters:
+-----------
+(1) save_name : file path and prefix for saving the created figures.
+(2) stan_code_file_name : Name of file containing the stan model code.
+(3) sigma_tau : Values of prior variance for the Jung-inspired model.
+(4) data_path : Path to data file.
+(5) nJudges_M : Number of judges in simulations.
 """
 
 import numpy as np
@@ -35,31 +35,33 @@ plt.rcParams.update({'figure.figsize': (10, 6)})
 import sys
 
 # Seed for reproducibility
-npr.seed(123)
+npr.seed(111)
 
 # figure storage name
-save_name = sys.argv[1] + "sl_compas"
-
-# How many groups if jung model is used
-group_amount = int(sys.argv[2])
+save_name = sys.argv[1] + "sl_compas_"
 
 # Name of stan model code file
-stan_code_file_name = sys.argv[3]
+stan_code_file_name = sys.argv[2]
 
 # Variance prior
-sigma_tau = float(sys.argv[4])
+sigma_tau = float(sys.argv[3])
 
-# figure storage name
-data_path = sys.argv[5]
+# Path to the data file
+data_path = sys.argv[4]
+
+# Number of synthetic judges
+nJudges_M = int(sys.argv[5])
 
 # Prefix for the figures and log files
 
 print("These results have been obtained with the following settings:")
 
-print("Number of groups:", group_amount)
-
 print("Prior for the variances:", sigma_tau)
 
+def standardize(x):
+    '''Standardize x to zero mean and unit (sample) variance.'''
+
+    return (x - np.mean(x)) / np.std(x, ddof=1)
+
 ##########################
 
 # ## Evaluator modules
@@ -68,19 +70,19 @@ print("Prior for the variances:", sigma_tau)
 
 def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
     '''
-    Fit a predictive model (default logistic regression) with given training 
-    instances and return probabilities for test instances to obtain a given 
+    Fit a predictive model (default logistic regression) with given training
+    instances and return probabilities for test instances to obtain a given
     class label.
-    
+
     Arguments:
     ----------
-    
+
     x_train -- x values of training instances
     y_train -- y values of training instances
     x_test -- x values of test instances
     class_value -- class label for which the probabilities are counted for.
     model_type -- type of model to be fitted.
-    
+
     Returns:
     --------
     (1) Trained predictive model
@@ -130,13 +132,13 @@ def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
 
 def getProbabilityForClass(x, model, class_value):
     '''
-    Function (wrapper) for obtaining the probability of a class given x and a 
+    Function (wrapper) for obtaining the probability of a class given x and a
     predictive model.
 
     Arguments:
     -----------
     x -- individual features, an array of shape (observations, features)
-    model -- a trained sklearn model. Predicts probabilities for given x. 
+    model -- a trained sklearn model. Predicts probabilities for given x.
         Should accept input of shape (observations, features)
     class_value -- the resulting class to predict (usually 0 or 1).
 
@@ -155,8 +157,8 @@ def getProbabilityForClass(x, model, class_value):
 
 
 # ### Contraction algorithm
-# 
-# Below is an implementation of Lakkaraju's team's algorithm presented in 
+#
+# Below is an implementation of Lakkaraju's team's algorithm presented in
 # [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant
 # parameters to be passed to the function are presented in the description.
 
@@ -164,7 +166,7 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
                 accRateR_col, r):
     '''
     This is an implementation of the algorithm presented by Lakkaraju
-    et al. in their paper "The Selective Labels Problem: Evaluating 
+    et al. in their paper "The Selective Labels Problem: Evaluating
     Algorithmic Predictions in the Presence of Unobservables" (2017).
 
     Arguments:
@@ -177,7 +179,7 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
     resultY_col -- String, the name of the column containing the realization
     modelProbS_col -- String, the name of the column containing the probability
         scores from the black-box model B.
-    accRateR_col -- String, the name of the column containing the judges' 
+    accRateR_col -- String, the name of the column containing the judges'
         acceptance rates
     r -- Float between 0 and 1, the given acceptance rate.
 
@@ -192,25 +194,28 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
     D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
 
     ##### Agreement rate computation begins #######
-    
-    max_leniency = max(df[accRateR_col])
-    
+
+    Q_leniency = max(df[accRateR_col])
+
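+    # Contraction is undefined when the requested acceptance rate r exceeds
+    # the acceptance rate of the most lenient judge q; return NaNs so that
+    # downstream aggregation can use np.nanmean / np.nanstd.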
+    if Q_leniency < r:
+        return np.nan, np.nan
+
     # Sort D_q
     # "Observations deemed as high risk by B are at the top of this list"
     D_sort_q = D_q.sort_values(by=modelProbS_col, ascending=False)
-    
+
-    # The agreement rate is now computed as the percentage of all the positive
-    # decisions given by q in the 1-r % of the most dangerous subjects.
+    # The agreement rate is computed as the share of q's negative decisions
+    # that fall within the (1 - Q_leniency) fraction of subjects that the
+    # model B deems most dangerous.
     all_negatives = np.sum(D_sort_q[decisionT_col] == 0)
-    
-    n_most_dangerous = int(round(D_q.shape[0] * (1-max_leniency)))
-    
-    agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous])
+
+    n_most_dangerous = int(round(D_q.shape[0] * (1-Q_leniency)))
+
+    agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous] == 0)
 
     print("The agreement rate of contraction is:", agreed_decisions/all_negatives)
-    
+
     ##### Agreement rate computation ends #######
-    
+
     # All observations of R_q have observed outcome labels.
     # "R_q is the set of observations in D_q with observed outcome labels."
     R_q = D_q[D_q[decisionT_col] == 1].copy()
@@ -236,7 +241,7 @@ def trueEvaluationEvaluator(df, resultY_col, r):
     df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
 
     to_release = int(round(df.shape[0] * r))
-    
+
     failed = df[resultY_col][0:to_release] == 0
 
     return np.sum(failed) / df.shape[0], df.shape[0]
@@ -251,7 +256,7 @@ def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
                                               ascending=True)
 
     to_release = int(round(df_observed.shape[0] * r))
-    
+
     failed = df_observed[resultY_col][0:to_release] == 0
 
     if adjusted:
@@ -266,13 +271,12 @@ def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
 compas_raw = pd.read_csv(data_path)
 
 # Select columns
-compas = compas_raw[[
-    'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
-    'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
-    'two_year_recid', 'c_jail_in', 'c_jail_out'
-]]
+compas = compas_raw[['age', 'c_charge_degree', 'race', 'age_cat', 'score_text',
+                     'sex', 'priors_count', 'days_b_screening_arrest',
+                     'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in',
+                     'c_jail_out']]
 
-# Subset values, see reasons in ProPublica methodology.
+# Filter rows as described in the ProPublica methodology.
 compas = compas.query('days_b_screening_arrest <= 30 and \
                       days_b_screening_arrest >= -30 and \
                       is_recid != -1 and \
@@ -286,30 +290,39 @@ compas = compas[compas.score_text.notnull()]
 compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1)
 
 # Convert string values to dummies, drop first to keep full rank.
-compas_dummy = pd.get_dummies(
-    compas,
-    columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
-    drop_first=True)
+compas_dummy = pd.get_dummies(compas,
+                              columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
+                              drop_first=True)
 
-compas_dummy.drop(columns=[
-    'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
-    'two_year_recid', 'score_text', 'is_recid'
-],
-    inplace=True)
+compas_dummy.drop(columns=['age', 'days_b_screening_arrest', 'c_jail_in',
+                           'c_jail_out', 'two_year_recid', 'score_text',
+                           'is_recid'], inplace=True)
 
 # Shuffle rows for random judge assignment
 compas_shuffled = compas_dummy.sample(frac=1)
 
-nJudges_M = 10
+# Decision assignment
+
+# Assign fixed leniencies of 0.1, 0.5 and 0.9 to the nJudges_M judges
+# (nJudges_M is read from the command line, e.g. 12 or 16, and should be
+# divisible by three). The observations are then distributed evenly to the
+# decision-makers and each judge gives a positive decision to the r% of
+# their subjects with the lowest COMPAS score. Uniform U(0.1, 0.9) leniency
+# draws are left commented out below as an alternative.
 
 # Assign judges as evenly as possible
 judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False)
 
-# Assign fixed leniencies from 0.1 to 0.9
-judge_leniency = np.array([0.1, 0.3, 0.5, 0.7, 0.9]*2)
+# Alternative: draw leniencies uniformly from U(0.1, 0.9)
+# judge_leniency = npr.uniform(0.1, 0.9, nJudges_M)
 
+# Assign leniencies
+judge_leniency = np.repeat([0.1, 0.5, 0.9], int(nJudges_M/3))
+
+# Map each observation to its judge's leniency
 judge_leniency = judge_leniency[judge_ID]
 
+# Attach to data
 compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID,
                                          judge_leniency=judge_leniency)
 
@@ -340,20 +353,27 @@ compas_unlabeled = compas_shuffled.copy()
 # Hide unobserved
 compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan
 
-# Choose feature_columns
+# Choose feature_columns for the regression (priors count, charge degree,
+# race, age category and sex dummies).
 feature_cols = ~compas_labeled.columns.isin(
     ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T'])
 
 feature_cols = compas_labeled.columns[feature_cols]
 
+# Standardize priors count
+
+compas_labeled.priors_count = standardize(compas_labeled.priors_count)
 
 ####### Draw figures ###########
 
 def drawDiagnostics(title, save_name, f_rates, titles):
-    
+    '''
+    Draw boxplot figures. One panel for each method.
+    '''
+
     cols = 2
     rows = np.ceil(len(f_rates) / cols)
-    
+
     plt.figure(figsize=(16, 4.5*rows+1))
 
     ax = plt.subplot(rows, cols, 1)
@@ -383,6 +403,8 @@ def drawDiagnostics(title, save_name, f_rates, titles):
     plt.savefig(save_name + '_diagnostic_plot')
 
     plt.show()
+    plt.close('all')
+
 
 nIter = 10
 
@@ -398,87 +420,89 @@ f_rate_cf = np.zeros((nIter, 8))
 sm = Model(stan_file=stan_code_file_name)
 sm.compile()
 
+# On each of the nIter iterations, randomly split the nJudges_M judges (and
+# their subjects) into a test set and a training set.
+
 for i in range(nIter):
-    
-    ids = np.arange(5)
-    selected = ids + npr.choice([0, 5], 5)
-        
+
+    # Split data using judge IDs.
+
+    # Get number of judges
+    id_max = np.max(compas_labeled.judge_ID) + 1
+
+    # Choose half of the judges at random (without replacement)
+    selected = npr.choice(np.arange(id_max), int(id_max/2), False)
+
     test_labeled = compas_labeled[compas_labeled.judge_ID.isin(selected)]
     train = compas_labeled[~compas_labeled.judge_ID.isin(selected)]
 
     # Renew IDs
-    test_labeled.loc[:, 'judge_ID'] = test_labeled.judge_ID % 5
-    
+    test_labeled.loc[:, 'judge_ID'] = test_labeled.groupby('judge_ID').ngroup()
+
     # Assign same observations to unlabeled data
     test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values]
-    
-    
+
     # Train a logistic regression model
     B_model, predictions = fitPredictiveModel(
         train.loc[train['decision_T'] == 1, feature_cols],
         train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0)
-    
+
     # Attach predictions to data
     test_labeled = test_labeled.assign(B_prob_0_model=predictions)
     test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
-    
+
     test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-    
-    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
-    
+
     # Find observed values
     observed = test_labeled['decision_T'] == 1
-    
+
     # Assign data to the model
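+    # Dictionary keys follow the Stan model's data block: D is the number of
+    # covariate columns (hardcoded to 10 here), N_obs / N_cens are the counts
+    # of subjects with observed and censored outcomes, M is the number of
+    # judges in the test split, jj_* are 1-based judge indices, dec_* the
+    # decisions, X_* the covariate matrices and y_obs the observed outcomes.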
     dat = dict(D=int(10),
                N_obs=int(np.sum(observed)),
                N_cens=int(np.sum(~observed)),
-               K=int(group_amount),
                sigma_tau=sigma_tau,
                M=int(len(set(test_labeled['judge_ID']))),
                jj_obs=test_labeled.loc[observed, 'judge_ID']+1,
                jj_cens=test_labeled.loc[~observed, 'judge_ID']+1,
-               kk_obs=kk_array[observed]+1,
-               kk_cens=kk_array[~observed]+1,
                dec_obs=test_labeled.loc[observed, 'decision_T'],
                dec_cens=test_labeled.loc[~observed, 'decision_T'],
-               X_obs=test_labeled.loc[observed, feature_cols.values],
-               X_cens=test_labeled.loc[~observed, feature_cols.values],
+               X_obs=test_labeled.loc[observed, feature_cols.values].values,
+               X_cens=test_labeled.loc[~observed, feature_cols.values].values,
                y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
-    
-    # Do the sampling    
-    fit = sm.sample(data=dat, chains=4, seed=123, warmup_iters=1500, 
-                        sampling_iters=4500, adapt_delta=0.975)
+
+
+    # Do the sampling
+    fit = sm.sample(data=dat, chains=4, seed=123, warmup_iters=1500,
+                        sampling_iters=4000, adapt_delta=0.975)
 
     # Extract predictions
     fit_colnames = fit.column_names
 
-    print("Colnames", fit.column_names)
-
     pred_cols = [s for s in fit_colnames if 'y_est' in s]
-   
+
     pars = fit.get_drawset(params=pred_cols)
-    
+
-    fit.get_drawset(params=pred_cols)
 
-    print("Drawset:", fit.get_drawset())    
-    
     plt.close('all')
     gc.collect()
     gc.collect()
 
     print(fit.diagnose(), file=open(save_name + '_stan_fit_diagnostics_' +
           str(i) + '.txt', 'w'))
-    
+
+    # Save the fit object to csv files on every 5th iteration
+    if i % 5 == 0:
+        fit.save_csvfiles(dir=save_name, basename='fit_obj_i_' + str(i))
     # Bayes
-    
+
     # Format matrix, each row is a sample of the posterior
     # columns are observations
-            
+
     #y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
     y_imp = np.ones((pars.shape[0], test_labeled.shape[0]))
 
-    # Impute the unobserved with the estimated 
+    # Impute the unobserved outcomes with the posterior estimates.
     # Reverse the binary coding to compute failures as sum. 0 == 0 = 1
     #y_imp[:, ~observed] = 1 - pars['y_est']
     y_imp[:, ~observed] = 1 - pars
@@ -491,53 +515,53 @@ for i in range(nIter):
 
     gc.collect()
     gc.collect()
-    
+
     process = psutil.Process(os.getpid())
-    print("Memory use after deletion:", process.memory_info().rss)  # in bytes 
-    
+    print("Memory use after deletion:", process.memory_info().rss)  # in bytes
+
     # Impute the observed
     y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
 
-   
+
     for r in range(1, 9):
-        
+
         to_release = np.round(test_labeled.shape[0] * (r / 10)).astype(int)
-                        
+
         failed = np.sum(y_imp[:, 0:to_release], axis=1)
-            
+
         f_rate_cf[i, r - 1] = np.mean(failed / test_labeled.shape[0])
-    
+
         print(".", end="")
-    
+
         # True evaluation
-    
-        f_rate_true[i, r - 1], N = trueEvaluationEvaluator(test_unlabeled, 
+
+        f_rate_true[i, r - 1], N = trueEvaluationEvaluator(test_unlabeled,
                    'result_Y', r / 10)
-            
+
         # Labeled outcomes only
-    
-        f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled, 
+
+        f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled,
                     'decision_T', 'result_Y', r / 10)
-    
+
         # Contraction
-    
-        f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judge_ID', 
-                   'decision_T', 'result_Y', 'B_prob_0_model', 
+
+        f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judge_ID',
+                   'decision_T', 'result_Y', 'B_prob_0_model',
                    'judge_leniency', r / 10)
-    
-failure_rates[:, 0] = np.mean(f_rate_true, axis=0)
-failure_rates[:, 1] = np.mean(f_rate_label, axis=0)
-failure_rates[:, 2] = np.mean(f_rate_cont, axis=0)
-failure_rates[:, 3] = np.mean(f_rate_cf, axis=0)
 
-failure_stds[:, 0] = np.std(f_rate_true, axis=0)
-failure_stds[:, 1] = np.std(f_rate_label, axis=0)
-failure_stds[:, 2] = np.std(f_rate_cont, axis=0)
-failure_stds[:, 3] = np.std(f_rate_cf, axis=0)
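+# In failure_rates and failure_stds, rows correspond to acceptance rates
+# 0.1-0.8 and columns to the methods in `labels` below; nanmean/nanstd skip
+# the NaNs that contraction returns when r exceeds the maximum leniency.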
+failure_rates[:, 0] = np.nanmean(f_rate_true, axis=0)
+failure_rates[:, 1] = np.nanmean(f_rate_label, axis=0)
+failure_rates[:, 2] = np.nanmean(f_rate_cont, axis=0)
+failure_rates[:, 3] = np.nanmean(f_rate_cf, axis=0)
+
+failure_stds[:, 0] = np.nanstd(f_rate_true, axis=0, ddof=1)
+failure_stds[:, 1] = np.nanstd(f_rate_label, axis=0, ddof=1)
+failure_stds[:, 2] = np.nanstd(f_rate_cont, axis=0, ddof=1)
+failure_stds[:, 3] = np.nanstd(f_rate_cf, axis=0, ddof=1)
 
 x_ax = np.arange(0.1, 0.9, 0.1)
 
-labels = ['True Evaluation', 'Labeled outcomes', 'Contraction', 
+labels = ['True Evaluation', 'Labeled outcomes', 'Contraction',
           'Counterfactuals']
 
 colours = ['g', 'magenta', 'b', 'r']
@@ -562,12 +586,14 @@ plt.savefig(save_name + '_all')
 
 plt.show()
 
+plt.close('all')
+
 # Plot error of the methods (contraction and counterfactual)
 # w.r.t True evaluation
 for i in range(2, failure_rates.shape[1]):
 
     error = np.abs(failure_rates[:, 0] - failure_rates[:, i])
-    
+
     plt.errorbar(x_ax,
                  error,
                  label=labels[i],
@@ -575,6 +601,7 @@ for i in range(2, failure_rates.shape[1]):
                  linestyle=line_styles[i],
                  yerr=failure_stds[:, i])
 
+plt.axhline(y=0, c='k')
 plt.xlabel('Acceptance rate')
 plt.ylabel('Error w.r.t. True evaluation')
 plt.legend()
@@ -584,12 +611,22 @@ plt.savefig(save_name + '_all_err')
 
 plt.show()
 
+plt.close('all')
+
+print("\nSingle runs:")
+print(f_rate_true, f_rate_label, f_rate_cont, f_rate_cf)
+
 print("\nFailure rates:")
 print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
 
 print("\nStandard deviations:")
 print(np.array2string(failure_stds, formatter={'float_kind':lambda x: "%.5f" % x}))
 
+# Save arrays
+np.save(save_name + "_true_FRs", f_rate_true)
+np.save(save_name + "_labeled_FRs", f_rate_label)
+np.save(save_name + "_contraction_FRs", f_rate_cont)
+np.save(save_name + "_counterfactuals_FRs", f_rate_cf)
 
 print("\nMean absolute errors:")
 for i in range(1, failure_rates.shape[1]):
@@ -601,4 +638,4 @@ for i in range(1, failure_rates.shape[1]):
 drawDiagnostics(title="Variation of failure rate estimates by method\nCOMPAS data set",
                 save_name=save_name,
                 f_rates=[f_rate_true, f_rate_label, f_rate_cont, f_rate_cf],
-                titles=labels)
\ No newline at end of file
+                titles=labels)