diff --git a/analysis_and_scripts/code_jung.stan b/analysis_and_scripts/code_jung.stan new file mode 100644 index 0000000000000000000000000000000000000000..8ee246647553fdac0c131969cccc86e86c09a732 --- /dev/null +++ b/analysis_and_scripts/code_jung.stan @@ -0,0 +1,91 @@ +data { + int<lower=1> D; // Dimensions + int<lower=0> N_obs; // Number of "observed observations" (with T = 1) + int<lower=0> N_cens; // Number of "censored observations" (with T = 0) + int<lower=1> M; // Number of judges + int<lower=1> K; // Number of groups + real<lower=0> sigma_tau; + int<lower=1, upper=M> jj_obs[N_obs]; // judge_ID + int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID + int<lower=1, upper=K> kk_obs[N_obs]; // Grouping + int<lower=1, upper=K> kk_cens[N_cens]; // Grouping + int<lower=0, upper=1> dec_obs[N_obs]; // Positive decisions + int<lower=0, upper=1> dec_cens[N_cens]; // Negative decisions + row_vector[D] X_obs[N_obs]; // Private features of "observed observations" (with T = 1) + row_vector[D] X_cens[N_cens]; // Private features of "censored observations" (with T = 1) + int<lower=0, upper=1> y_obs[N_obs]; // Observed outcomes +} + +parameters { + vector[N_obs] Z_obs; + vector[N_cens] Z_cens; + real alpha_T[M]; // Judge-specific intercepts + + vector[D] beta_XT[K]; + vector[D] beta_XY[K]; + + vector<lower=0>[K] beta_ZT_raw; // Coefficient for the latent variable. + vector<lower=0>[K] beta_ZY_raw; // Coefficient for the latent variable. + + real<lower=0> tau_XT; + real<lower=0> tau_XY; + real<lower=0> tau_ZT; + real<lower=0> tau_ZY; +} + +transformed parameters{ + vector<lower=0>[K] beta_ZT_cumulative; + vector<lower=0>[K] beta_ZY_cumulative; + + vector<lower=0>[K] beta_ZT; + vector<lower=0>[K] beta_ZY; + + beta_ZT[1] = beta_ZT_raw[1]; + beta_ZY[1] = beta_ZY_raw[1]; + + if(K >= 2){ + beta_ZT[2:] = tau_ZT * beta_ZT_raw[2:]; + beta_ZY[2:] = tau_ZY * beta_ZY_raw[2:]; + } + + beta_ZT_cumulative = cumulative_sum(beta_ZT); + beta_ZY_cumulative = cumulative_sum(beta_ZY); +} + +model { + Z_obs ~ std_normal(); + Z_cens ~ std_normal(); + + tau_XY ~ normal(0, sigma_tau); + tau_XT ~ normal(0, sigma_tau); + tau_ZY ~ normal(0, sigma_tau); + tau_ZT ~ normal(0, sigma_tau); + + beta_XT[1] ~ std_normal(); // first group + beta_XY[1] ~ std_normal(); + beta_ZT_raw ~ std_normal(); + beta_ZY_raw ~ std_normal(); + + if(K >= 2){ + for(i in 2:K){ // random walk prior here + beta_XT[i] ~ normal(beta_XT[i-1], tau_XT); // ith group + beta_XY[i] ~ normal(beta_XY[i-1], tau_XY); + } + } + + for(i in 1:N_obs){ + dec_obs[i] ~ bernoulli_logit(alpha_T[jj_obs[i]] + X_obs[i] * beta_XT[kk_obs[i]] + beta_ZT_cumulative[kk_obs[i]] * Z_obs[i]); + y_obs[i] ~ bernoulli_logit(X_obs[i] * beta_XY[kk_obs[i]] + beta_ZY_cumulative[kk_obs[i]] * Z_obs[i]); + } + + for(i in 1:N_cens) + dec_cens[i] ~ bernoulli_logit(alpha_T[jj_cens[i]] + X_cens[i] * beta_XT[kk_cens[i]] + beta_ZT_cumulative[kk_cens[i]] * Z_cens[i]); +} + +generated quantities { + real<lower=0, upper=1> y_est[N_cens]; + + for(i in 1:N_cens){ + y_est[i] = inv_logit(X_cens[i] * beta_XY[kk_cens[i]] + beta_ZY_cumulative[kk_cens[i]] * Z_cens[i]); + } +} diff --git a/analysis_and_scripts/stan_modelling_empirical.py b/analysis_and_scripts/stan_modelling_empirical.py new file mode 100644 index 0000000000000000000000000000000000000000..e05c3a7653f0f5f5ae5b57fdae9498687cf2d63b --- /dev/null +++ b/analysis_and_scripts/stan_modelling_empirical.py @@ -0,0 +1,629 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Author: Riku Laine +# Date: 26JUL2019 +# Project name: Potential outcomes in model 
evaluation +# Description: This script creates the figures and results used +# in empirical data experiments. +# +# Parameters: +# ----------- +# (1) figure_path : file name for saving the created figures. +# (2) nIter : Number of train test splits to perform on the data. +# (3) group_amount : How many groups if Jung-inspired model is used. +# (4) stan_code_file_name : Name of file containing the stan model code. +# (5) sigma_tau : Values of prior variance for the Jung-inspired model. +# (6) data_path : File of compas data. + +""" + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import scipy.stats as scs +import scipy.special as ssp +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +import pystan + +plt.switch_backend('agg') + +import sys + +# figure storage name +figure_path = sys.argv[1] + +# Iterations +nIter = int(sys.argv[2]) + +# How many groups if jung model is used +group_amount = int(sys.argv[3]) + +# Name of stan model code file +stan_code_file_name = sys.argv[4] + +# Variance prior +sigma_tau = float(sys.argv[5]) + +# figure storage name +data_path = sys.argv[6] + +# Prefix for the figures and log files + +print("These results have been obtained with the following settings:") + +print("Number of train test splits :", nIter) + +print("Number of groups:", group_amount) + +print("Prior for the variances:", sigma_tau) + +save_name = "sl_compas" + +def inv_logit(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def logit(x): + return np.log(x) - np.log(1.0 - x) + + +def inverse_cumulative(x, mu, sigma): + '''Compute the inverse of the cumulative distribution of logit-normal + distribution at x with parameters mu and sigma (mean and st.dev.).''' + + return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) + +########################## + +# ## Evaluator modules + +# ### Convenience functions + +def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None): + ''' + Fit a predictive model (default logistic regression) with given training + instances and return probabilities for test instances to obtain a given + class label. + + Arguments: + ---------- + + x_train -- x values of training instances + y_train -- y values of training instances + x_test -- x values of test instances + class_value -- class label for which the probabilities are counted for. + model_type -- type of model to be fitted. + + Returns: + -------- + (1) Trained predictive model + (2) Probabilities for given test inputs for given class. + ''' + + if model_type is None or model_type in ["logistic_regression", "lr"]: + # Instantiate the model (using the default parameters) + logreg = LogisticRegression(solver='lbfgs') + + # Check shape and fit the model. + if x_train.ndim == 1: + logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train) + else: + logreg = logreg.fit(x_train, y_train) + + label_probs_logreg = getProbabilityForClass(x_test, logreg, + class_value) + + return logreg, label_probs_logreg + + elif model_type in ["random_forest", "rf"]: + # Instantiate the model + forest = RandomForestClassifier(n_estimators=100, max_depth=3) + + # Check shape and fit the model. 
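+ # sklearn estimators expect a 2-D feature matrix of shape (n_samples, n_features),
+ # so a one-dimensional training vector is reshaped into a single column before fitting.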
+ if x_train.ndim == 1: + forest = forest.fit(x_train.values.reshape(-1, 1), y_train) + else: + forest = forest.fit(x_train, y_train) + + label_probs_forest = getProbabilityForClass(x_test, forest, + class_value) + + return forest, label_probs_forest + + elif model_type == "fully_random": + + label_probs = np.ones_like(x_test) / 2 + + model_object = lambda x: 0.5 + + return model_object, label_probs + else: + raise ValueError("Invalid model_type!", model_type) + + +def getProbabilityForClass(x, model, class_value): + ''' + Function (wrapper) for obtaining the probability of a class given x and a + predictive model. + + Arguments: + ----------- + x -- individual features, an array of shape (observations, features) + model -- a trained sklearn model. Predicts probabilities for given x. + Should accept input of shape (observations, features) + class_value -- the resulting class to predict (usually 0 or 1). + + Returns: + -------- + (1) The probabilities of given class label for each x. + ''' + if x.ndim == 1: + # if x is vector, transform to column matrix. + f_values = model.predict_proba(np.array(x).reshape(-1, 1)) + else: + f_values = model.predict_proba(x) + + # Get correct column of predicted class, remove extra dimensions and return. + return f_values[:, model.classes_ == class_value].flatten() + + +# ### Contraction algorithm +# +# Below is an implementation of Lakkaraju's team's algorithm presented in +# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant +# parameters to be passed to the function are presented in the description. + +def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, + accRateR_col, r): + ''' + This is an implementation of the algorithm presented by Lakkaraju + et al. in their paper "The Selective Labels Problem: Evaluating + Algorithmic Predictions in the Presence of Unobservables" (2017). + + Arguments: + ---------- + df -- The (Pandas) data frame containing the data, judge decisions, + judge IDs, results and probability scores. + judgeIDJ_col -- String, the name of the column containing the judges' IDs + in df. + decisionT_col -- String, the name of the column containing the judges' decisions + resultY_col -- String, the name of the column containing the realization + modelProbS_col -- String, the name of the column containing the probability + scores from the black-box model B. + accRateR_col -- String, the name of the column containing the judges' + acceptance rates + r -- Float between 0 and 1, the given acceptance rate. + + Returns: + -------- + (1) The estimated failure rate at acceptance rate r. + ''' + # Get ID of the most lenient judge. + most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()] + + # Subset. "D_q is the set of all observations judged by q." + D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() + + # All observations of R_q have observed outcome labels. + # "R_q is the set of observations in D_q with observed outcome labels." + R_q = D_q[D_q[decisionT_col] == 1].copy() + + # Sort observations in R_q in descending order of confidence scores S and + # assign to R_sort_q. 
+ # "Observations deemed as high risk by B are at the top of this list" + R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False) + + number_to_remove = int( + round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0]))) + + # "R_B is the list of observations assigned to t = 1 by B" + R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] + + return np.sum(R_B[resultY_col] == 0) / D_q.shape[0] + + +# ### Evaluators + +def contractionEvaluator(df, featureX_col, judgeIDJ_col, decisionT_col, + resultY_col, accRateR_col, r): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + # Invoke the original contraction. + FR = contraction(test, + judgeIDJ_col=judgeIDJ_col, + decisionT_col=decisionT_col, + resultY_col=resultY_col, + modelProbS_col="B_prob_0_model", + accRateR_col=accRateR_col, + r=r) + + return FR + + +def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r, + fit_model=True): + + train, test = train_test_split(df, test_size=0.5) + + if fit_model: + B_model, predictions = fitPredictiveModel(train[featureX_col], + train[resultY_col], + test[featureX_col], 0) + + test = test.assign(B_prob_0_model=predictions) + + test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + to_release = int(round(test.shape[0] * r)) + + return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + + +def labeledOutcomesEvaluator(df, + featureX_col, + decisionT_col, + resultY_col, + r, + adjusted=False, + fit_model=True): + + train, test = train_test_split(df, test_size=0.5) + + if fit_model: + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + test_observed = test.loc[test[decisionT_col] == 1, :] + + test_observed = test_observed.sort_values(by='B_prob_0_model', + inplace=False, + ascending=True) + + to_release = int(round(test_observed.shape[0] * r)) + + if adjusted: + return np.mean(test_observed[resultY_col][0:to_release] == 0) + + return np.sum( + test_observed[resultY_col][0:to_release] == 0) / test.shape[0] + + +def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, + accRateR_col, r): + + # Get judges with correct leniency as list + is_correct_leniency = df[accRateR_col].round(1) == r + + # No judges with correct leniency + if np.sum(is_correct_leniency) == 0: + return np.nan + + correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] + + # Released are the people they judged and released, T = 1 + released = df[df[judgeIDJ_col].isin(correct_leniency_list) + & (df[decisionT_col] == 1)] + + # Get their failure rate, aka ratio of reoffenders to number of people judged in total + return np.sum(released[resultY_col] == 0) / correct_leniency_list.shape[0] + + +################### + +# Read in the data +compas_raw = pd.read_csv(data_path) + +# Select columns +compas = compas_raw[[ + 'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', + 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', + 'two_year_recid', 'c_jail_in', 'c_jail_out' +]] + +# Subset values, see reasons in ProPublica methodology. 
+compas = compas.query('days_b_screening_arrest <= 30 and \ + days_b_screening_arrest >= -30 and \ + is_recid != -1 and \ + c_charge_degree != "O"') + +# Drop row if score_text is na +compas = compas[compas.score_text.notnull()] + +# Recode recidivism values to fit earlier notation +# So here result_Y = 1 - is_recid (inverted binary coding). +compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1) + +# Convert string values to dummies, drop first so full rank +compas_dummy = pd.get_dummies( + compas, + columns=['c_charge_degree', 'race', 'age_cat', 'sex'], + drop_first=True) + +compas_dummy.drop(columns=[ + 'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', + 'two_year_recid', 'score_text', 'is_recid' +], + inplace=True) + +# Shuffle rows for random judge assignment +compas_shuffled = compas_dummy.sample(frac=1) + +nJudges_M = 9 + +# Assign judges as evenly as possible +judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False) + +# Assign fixed leniencies from 0.1 to 0.9 +judge_leniency = np.arange(1, 10) / 10 + +judge_leniency = judge_leniency[judge_ID] + +compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID, + judge_leniency=judge_leniency) + +# Sort by judges then probabilities in decreasing order +# Most dangerous for each judge are at the top. +compas_shuffled.sort_values(by=["judge_ID", "decile_score"], + ascending=False, + inplace=True) + +# Iterate over the data. Subject will be given a negative decision +# if they are in the top (1-r)*100% of the individuals the judge will judge. +# I.e. if their within-judge-index is under 1 - acceptance threshold times +# the number of subjects assigned to each judge they will receive a +# negative decision. +compas_shuffled.reset_index(drop=True, inplace=True) + +subjects_allocated = compas_shuffled.judge_ID.value_counts() + +compas_shuffled['judge_index'] = compas_shuffled.groupby('judge_ID').cumcount() + +compas_shuffled['decision_T'] = np.where( + compas_shuffled['judge_index'] < (1 - compas_shuffled['judge_leniency']) * + subjects_allocated[compas_shuffled['judge_ID']].values, 0, 1) + +compas_labeled = compas_shuffled.copy() +compas_unlabeled = compas_shuffled.copy() + +# Hide unobserved +compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan + +# Choose feature_columns +feature_cols = ~compas_labeled.columns.isin( + ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T']) + +feature_cols = compas_labeled.columns[feature_cols] + + +####### Draw figures ########### + +failure_rates = np.zeros((8, 6)) +failure_sems = np.zeros((8, 6)) + +f_rate_true = np.zeros((nIter, 8)) +f_rate_label = np.zeros((nIter, 8)) +f_rate_label_adj = np.zeros((nIter, 8)) +f_rate_human = np.zeros((nIter, 8)) +f_rate_cont = np.zeros((nIter, 8)) +f_rate_caus = np.zeros((nIter, 8)) + + +# Split data +train, test = train_test_split(compas_labeled, test_size=0.5) + +# Train a logistic regression model +B_model, predictions = fitPredictiveModel( + train.loc[train['decision_T'] == 1, feature_cols], + train.loc[train['decision_T'] == 1, 'result_Y'], test[feature_cols], 0) + +test = test.assign(B_prob_0_model=predictions) + +test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + +kk_array = pd.qcut(test['B_prob_0_model'], group_amount, labels=False) + +# Find observed values +observed = test['decision_T'] == 1 + +# Assign data to the model +dat = dict(D=1, + N_obs=np.sum(observed), + N_cens=np.sum(~observed), + K=group_amount, + sigma_tau=sigma_tau, + 
M=len(set(compas_labeled['judge_ID'])), + jj_obs=test.loc[observed, 'judge_ID']+1, + jj_cens=test.loc[~observed, 'judge_ID']+1, + kk_obs=kk_array[observed]+1, + kk_cens=kk_array[~observed]+1, + dec_obs=test.loc[observed, 'decision_T'], + dec_cens=test.loc[~observed, 'decision_T'], + X_obs=test.loc[observed, 'B_prob_0_model'].values.reshape(-1,1), + X_cens=test.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1), + y_obs=test.loc[observed, 'result_Y'].astype(int)) + +sm = pystan.StanModel(file=stan_code_file_name) + +fit = sm.sampling(data=dat, chains=5, iter=4000, control = dict(adapt_delta=0.9)) + +pars = fit.extract() + +print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w')) + +plt.figure(figsize=(15,30)) + +fit.plot(); + +plt.savefig(save_name + '_stan_diagnostic_plot') + +plt.show() +plt.close('all') + +# Bayes + +# Alusta matriisi, rivillä yksi otos posteriorista +# sarakkeet havaintoja +y_imp = np.ones((pars['y_est'].shape[0], test.shape[0])) + +# Täydennetään havaitsemattomat estimoiduilla +y_imp[:, ~observed] = 1-pars['y_est'] + +# Täydennetään havaitut havaituilla +y_imp[:, observed] = 1-test.loc[observed, 'result_Y'] + +Rs = np.arange(.1, .9, .1) + +to_release_list = np.round(test.shape[0] * Rs).astype(int) + +f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan) + +for i in range(len(to_release_list)): + est_failure_rates = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) / test.shape[0] + + f_rate_bayes[:, i] = est_failure_rates + + failure_rates[i, 5] = np.mean(est_failure_rates) + +for i in range(nIter): + + print(" [", i, "] ", sep='', end="") + + for r in np.arange(1, 9): + + print(".", end="") + + # True evaluation + + f_rate_true[i, r - 1] = trueEvaluationEvaluator( + compas_unlabeled, feature_cols, 'decision_T', 'result_Y', r / 10) + + # Labeled outcomes only + + f_rate_label[i, r - 1] = labeledOutcomesEvaluator( + compas_labeled, feature_cols, 'decision_T', 'result_Y', r / 10) + + # Adjusted labeled outcomes + + f_rate_label_adj[i, r - 1] = labeledOutcomesEvaluator( + compas_labeled, + feature_cols, + 'decision_T', + 'result_Y', + r / 10, + adjusted=True) + + # Human evaluation + + f_rate_human[i, r - 1] = humanEvaluationEvaluator( + compas_labeled, 'judge_ID', 'decision_T', 'result_Y', + 'judge_leniency', r / 10) + + # Contraction + + f_rate_cont[i, r - 1] = contractionEvaluator( + compas_labeled, feature_cols, 'judge_ID', 'decision_T', 'result_Y', + 'judge_leniency', r / 10) + +failure_rates[:, 0] = np.mean(f_rate_true, axis=0) +failure_rates[:, 1] = np.mean(f_rate_label, axis=0) +failure_rates[:, 2] = np.mean(f_rate_label_adj, axis=0) +failure_rates[:, 3] = np.mean(f_rate_human, axis=0) +failure_rates[:, 4] = np.mean(f_rate_cont, axis=0) + +failure_sems[:, 0] = scs.sem(f_rate_true, axis=0) +failure_sems[:, 1] = scs.sem(f_rate_label, axis=0) +failure_sems[:, 2] = scs.sem(f_rate_label_adj, axis=0) +failure_sems[:, 3] = scs.sem(f_rate_human, axis=0) +failure_sems[:, 4] = scs.sem(f_rate_cont, axis=0) +failure_sems[:, 5] = scs.sem(f_rate_bayes, axis=0, nan_policy='omit') + +x_ax = np.arange(0.1, 0.9, 0.1) + +labels = [ + 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', + 'Human evaluation', 'Contraction', 'Potential outcomes' +] +colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c'] + +for i in range(failure_rates.shape[1]): + plt.errorbar(x_ax, + failure_rates[:, i], + label=labels[i], + c=colours[i], + yerr=failure_sems[:, i]) + +plt.title('Failure rate vs. 
Acceptance rate') +plt.xlabel('Acceptance rate') +plt.ylabel('Failure rate') +plt.legend() +plt.grid() + +plt.savefig(save_name + '_all') + +plt.show() + +print("\nFailure rates:") +print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) + +print("\nMean absolute errors:") +for i in range(1, failure_rates.shape[1]): + print( + labels[i].ljust(len(max(labels, key=len))), + np.round( + np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) + + +# Draw diagnostic figures +f_rates= [f_rate_true, f_rate_label, f_rate_label_adj, f_rate_human, + f_rate_cont, f_rate_bayes] + +cols = 2 +rows = np.ceil(len(f_rates) / cols) + +plt.figure(figsize=(16, 4.5*rows+1)) + +ax = plt.subplot(rows, cols, 1) +x_ax = np.arange(1, 9, 1) / 10 + +plt.boxplot(f_rates[0], labels=x_ax) + +plt.title(labels[0]) +plt.xlabel('Acceptance rate') +plt.ylabel('Failure rate') +plt.grid() + +for i in range(len(f_rates)): + plt.subplot(rows, cols, i + 1, sharey=ax) + + plt.boxplot(f_rates[i], labels=x_ax) + + plt.title(labels[i]) + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.grid() + +plt.tight_layout() +plt.subplots_adjust(top=0.89) + +title = "COMPAS data set\nFluctuation of failure rate estimates per method" + +plt.suptitle(title, y=0.96, weight='bold') + +plt.savefig(save_name + '_diagnostic_plot') + +plt.show() \ No newline at end of file diff --git a/analysis_and_scripts/stan_modelling_theoretic.py b/analysis_and_scripts/stan_modelling_theoretic.py new file mode 100644 index 0000000000000000000000000000000000000000..4f3c5a98202f1bab666a3b7021b8e06d89ab5d10 --- /dev/null +++ b/analysis_and_scripts/stan_modelling_theoretic.py @@ -0,0 +1,1267 @@ +''' +# Author: Riku Laine +# Date: 25JUL2019 (start) +# Project name: Potential outcomes in model evaluation +# Description: This script creates the figures and results used +# in synthetic data experiments. +# +# Parameters: +# ----------- +# (1) figure_path : file name for saving the created figures. +# (2) N_sim : Size of simulated data set. +# (3) M_sim : Number of judges in simulated data, +# N_sim must be divisible by M_sim! +# (4) which : Which data + outcome analysis should be performed. +# (5) group_amount : How many groups if Jung-inspired model is used. +# (6) stan_code_file_name : Name of file containing the stan model code. +# (7) sigma_tau : Values of prior variance for the Jung-inspired model. +# +# Modifications: +# -------------- +# 26JUL2019 : RL - Changed add_epsilon default to True +# - Corrected the data set used by the causal model (from +# unlabeled to labeled) +# - Corrections to which == 11, coefficients and prints. +# - All plot, thinning off. +# - Fix one decider with leniency 0.9 +# +''' +# Refer to the `notes.tex` file for explanations about the modular framework. + +# Imports + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import scipy.stats as scs +import scipy.special as ssp +import scipy.integrate as si +import numpy.random as npr +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +import pystan +import gc + +plt.switch_backend('agg') + +import sys + +# figure storage name +figure_path = sys.argv[1] + +# Size of simulated data set +N_sim = int(sys.argv[2]) + +# Number of judges in simulated data, N_sim must be divisible by M_sim! +M_sim = int(sys.argv[3]) + +# Which data + outcome generation should be performed. 
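+# 1 = Bernoulli outcome, independent decisions, no unobservables
+# 2 = Bernoulli outcome, independent decisions, with unobservables
+# 3 = threshold outcome, independent decisions, with unobservables
+# 4 = Bernoulli outcome, non-independent (batch) decisions, with unobservables
+# 5 = threshold outcome, non-independent (batch) decisions, with unobservables
+# 6 = random decider, 7 = biased decider, 8 = 'bad' decider (beta_X = 0.2)
+# 9 = Bernoulli outcome + Bernoulli decider, 10 = batch decisions with beta_Z = 3,
+# 11 = batch decisions with beta_Z = 5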
+which = int(sys.argv[4]) + +# How many groups if jung model is used +group_amount = int(sys.argv[5]) + +# Name of stan model code file +stan_code_file_name = sys.argv[6] + +# Variance prior +sigma_tau = float(sys.argv[7]) + +# Settings + +plt.rcParams.update({'font.size': 16}) +plt.rcParams.update({'figure.figsize': (10, 6)}) + +print("These results have been obtained with the following settings:") + +print("Number of observations in the simulated data:", N_sim) + +print("Number of judges in the simulated data:", M_sim) + +print("Number of groups:", group_amount) + +print("Prior for the variances:", sigma_tau) + +# Basic functions + + +def inv_logit(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def logit(x): + return np.log(x) - np.log(1.0 - x) + + +def inverse_cumulative(x, mu, sigma): + '''Compute the inverse of the cumulative distribution of logit-normal + distribution at x with parameters mu and sigma (mean and st.dev.).''' + + return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) + + +# ## Data generation modules + +def bernoulliDGWithoutUnobservables(N_total=50000): + '''Generates data | Variables: X, Y | Outcome from Bernoulli''' + + df = pd.DataFrame() + + # Sample feature X from standard Gaussian distribution, N(0, 1). + df = df.assign(X=npr.normal(size=N_total)) + + # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inv_logit(X) + df = df.assign(probabilities_Y=inv_logit(df.X)) + + # Draw Y ~ Bernoulli(1 - inv_logit(X)) + # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inv_logit(X) + results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) + + df = df.assign(result_Y=results) + + return df + + +def thresholdDGWithUnobservables(N_total=50000, + beta_X=1.0, + beta_Z=1.0, + beta_W=0.2): + '''Generates data | Variables: X, Z, W, Y | Outcome by threshold''' + + df = pd.DataFrame() + + # Sample the variables from standard Gaussian distributions. + df = df.assign(X=npr.normal(size=N_total)) + df = df.assign(Z=npr.normal(size=N_total)) + df = df.assign(W=npr.normal(size=N_total)) + + # Calculate P(Y=0|X, Z, W) + probabilities_Y = inv_logit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) + + df = df.assign(probabilities_Y=probabilities_Y) + + # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise + df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1)) + + return df + + +def bernoulliDGWithUnobservables(N_total=50000, + beta_X=1.0, + beta_Z=1.0, + beta_W=0.2): + '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli''' + + df = pd.DataFrame() + + # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1). + df = df.assign(X=npr.normal(size=N_total)) + df = df.assign(Z=npr.normal(size=N_total)) + df = df.assign(W=npr.normal(size=N_total)) + + # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inv_logit(X) + probabilities_Y = inv_logit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) + + df = df.assign(probabilities_Y=probabilities_Y) + + # Draw Y from Bernoulli distribution + results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) + + df = df.assign(result_Y=results) + + return df + + +# ## Decider modules + +def humanDeciderLakkaraju(df, + featureX_col, + featureZ_col=None, + nJudges_M=100, + beta_X=1, + beta_Z=1, + add_epsilon=True): + '''Decider module | Non-independent batch decisions.''' + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Compute the number of subjects allocated for each judge. 
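+ # E.g. with the data-generation default N_total = 50000 and the default nJudges_M = 100,
+ # each judge is assigned 500 subjects.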
+ nSubjects_N = int(df.shape[0] / nJudges_M) + + # Assign judge IDs as running numbering from 0 to nJudges_M - 1 + df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) + + # Sample acceptance rates uniformly from a closed interval + # from 0.1 to 0.9 and round to tenth decimal place. + # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all + # values. + acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) + acceptance_rates = np.round(acceptance_rates, 10) + + # Replicate the rates so they can be attached to the corresponding judge ID. + df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + + if add_epsilon: + epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) + else: + epsilon = 0 + + if featureZ_col is None: + probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + else: + probabilities_T = inv_logit(beta_X * df[featureX_col] + + beta_Z * df[featureZ_col] + epsilon) + + + df = df.assign(probabilities_T=probabilities_T) + + # Sort by judges then probabilities in decreasing order + # Most dangerous for each judge are at the top. + df.sort_values(by=["judgeID_J", "probabilities_T"], + ascending=False, + inplace=True) + + # Iterate over the data. Subject will be given a negative decision + # if they are in the top (1-r)*100% of the individuals the judge will judge. + # I.e. if their within-judge-index is under 1 - acceptance threshold times + # the number of subjects assigned to each judge they will receive a + # negative decision. + df.reset_index(drop=True, inplace=True) + + df['decision_T'] = np.where((df.index.values % nSubjects_N) < + ((1 - df['acceptanceRate_R']) * nSubjects_N), + 0, 1) + + df_labeled = df.copy() + + # Hide unobserved + df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan + + return df_labeled, df + + +def bernoulliDecider(df, + featureX_col, + featureZ_col=None, + nJudges_M=100, + beta_X=1, + beta_Z=1, + add_epsilon=True): + '''Use X and Z to make a decision with probability + P(T=0|X, Z)=inv_logit(beta_X*X+beta_Z*Z).''' + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Compute the number of subjects allocated for each judge. + nSubjects_N = int(df.shape[0] / nJudges_M) + + # Assign judge IDs as running numbering from 0 to nJudges_M - 1 + df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) + + if add_epsilon: + epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) + else: + epsilon = 0 + + if featureZ_col is None: + probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + else: + probabilities_T = inv_logit(beta_X * df[featureX_col] + + beta_Z * df[featureZ_col] + epsilon) + + df = df.assign(probabilities_T=probabilities_T) + + # Draw T from Bernoulli distribution + decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0]) + + df = df.assign(decision_T=decisions) + + # Calculate the acceptance rates. + acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values + + # Replicate the rates so they can be attached to the corresponding judge ID. 
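+ # Note: this relies on groupby('judgeID_J') returning the per-judge means in ascending
+ # judge ID order, which matches the blockwise ordering produced by np.repeat above.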
+ df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + + df_labeled = df.copy() + + df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan + + return df_labeled, df + + +def quantileDecider(df, + featureX_col, + featureZ_col=None, + nJudges_M=100, + beta_X=1, + beta_Z=1, + add_epsilon=True): + '''Assign decisions by the value of inverse cumulative distribution function + of the logit-normal distribution at leniency r.''' + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Compute the number of subjects allocated for each judge. + nSubjects_N = int(df.shape[0] / nJudges_M) + + # Assign judge IDs as running numbering from 0 to nJudges_M - 1 + df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) + + # Sample acceptance rates uniformly from a closed interval + # from 0.1 to 0.9 and round to tenth decimal place. + # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all + # values. + acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) + acceptance_rates = np.round(acceptance_rates, 10) + + # Replicate the rates so they can be attached to the corresponding judge ID. + df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + + if add_epsilon: + epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) + else: + epsilon = 0 + + if featureZ_col is None: + probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + + # Compute the bounds straight from the inverse cumulative. + # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2. + df = df.assign(bounds=inverse_cumulative( + x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2))) + else: + probabilities_T = inv_logit(beta_X * df[featureX_col] + + beta_Z * df[featureZ_col] + epsilon) + + # Compute the bounds straight from the inverse cumulative. + # Assuming X and Z are i.i.d standard Gaussians with variance 1. + # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z). + df = df.assign(bounds=inverse_cumulative( + x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2))) + + df = df.assign(probabilities_T=probabilities_T) + + # Assign negative decision if the predicted probability (probabilities_T) is + # over the judge's threshold (bounds). + df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1)) + + df_labeled = df.copy() + + df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan + + return df_labeled, df + + +def randomDecider(df, nJudges_M=100, use_acceptance_rates=False): + '''Doesn't use any information about X and Z to make decisions. + + If use_acceptance_rates is False (default) then all decisions are positive + with probabiltiy 0.5. If True, probabilities will be sampled from + U(0.1, 0.9) and rounded to tenth decimal place.''' + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Compute the number of subjects allocated for each judge. + nSubjects_N = int(df.shape[0] / nJudges_M) + + # Assign judge IDs as running numbering from 0 to nJudges_M - 1 + df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) + + if use_acceptance_rates: + # Sample acceptance rates uniformly from a closed interval + # from 0.1 to 0.9 and round to tenth decimal place. + acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10) + else: + # No real leniency here -> set to 0.5. 
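+ # With p = 0.5 for every judge, the decisions drawn below are fair coin flips,
+ # independent of both X and Z.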
+ acceptance_rates = np.ones(nJudges_M) * 0.5 + + # Replicate the rates so they can be attached to the corresponding judge ID. + df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + + df = df.assign( + decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0])) + + df_labeled = df.copy() + + df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan + + return df_labeled, df + + +def biasDecider(df, + featureX_col, + featureZ_col=None, + nJudges_M=100, + beta_X=1, + beta_Z=1, + add_epsilon=True): + ''' + Biased decider: If X > 1, then X <- X * 0.75. People with high X, + get more positive decisions as they should. + + ''' + + # If X > 1, then X <- X * 0.75. People with high X, get more positive + # decisions as they should + df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] * + 0.75, df[featureX_col])) + + # If -2 < X -1, then X <- X + 0.5. People with X in [-2, 1], get less + # positive decisions as they should + df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1) == 1, + df.biased_X + 0.5, df.biased_X) + + # Assert that every judge will have the same number of subjects. + assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" + + # Use quantile decider, but judge by the biased X. + df_labeled, df = humanDeciderLakkaraju(df, + featureX_col='biased_X', + featureZ_col=featureZ_col, + nJudges_M=nJudges_M, + beta_X=beta_X, + beta_Z=beta_Z, + add_epsilon=add_epsilon) + + return df_labeled, df + + +# ## Evaluator modules + +# ### Convenience functions + +def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None): + ''' + Fit a predictive model (default logistic regression) with given training + instances and return probabilities for test instances to obtain a given + class label. + + Arguments: + ---------- + + x_train -- x values of training instances + y_train -- y values of training instances + x_test -- x values of test instances + class_value -- class label for which the probabilities are counted for. + model_type -- type of model to be fitted. + + Returns: + -------- + (1) Trained predictive model + (2) Probabilities for given test inputs for given class. + ''' + + if model_type is None or model_type in ["logistic_regression", "lr"]: + # Instantiate the model (using the default parameters) + logreg = LogisticRegression(solver='lbfgs') + + # Check shape and fit the model. + if x_train.ndim == 1: + logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train) + else: + logreg = logreg.fit(x_train, y_train) + + label_probs_logreg = getProbabilityForClass(x_test, logreg, + class_value) + + return logreg, label_probs_logreg + + elif model_type in ["random_forest", "rf"]: + # Instantiate the model + forest = RandomForestClassifier(n_estimators=100, max_depth=3) + + # Check shape and fit the model. + if x_train.ndim == 1: + forest = forest.fit(x_train.values.reshape(-1, 1), y_train) + else: + forest = forest.fit(x_train, y_train) + + label_probs_forest = getProbabilityForClass(x_test, forest, + class_value) + + return forest, label_probs_forest + + elif model_type == "fully_random": + + label_probs = np.ones_like(x_test) / 2 + + model_object = lambda x: 0.5 + + return model_object, label_probs + else: + raise ValueError("Invalid model_type!", model_type) + + +def getProbabilityForClass(x, model, class_value): + ''' + Function (wrapper) for obtaining the probability of a class given x and a + predictive model. 
+ + Arguments: + ----------- + x -- individual features, an array of shape (observations, features) + model -- a trained sklearn model. Predicts probabilities for given x. + Should accept input of shape (observations, features) + class_value -- the resulting class to predict (usually 0 or 1). + + Returns: + -------- + (1) The probabilities of given class label for each x. + ''' + if x.ndim == 1: + # if x is vector, transform to column matrix. + f_values = model.predict_proba(np.array(x).reshape(-1, 1)) + else: + f_values = model.predict_proba(x) + + # Get correct column of predicted class, remove extra dimensions and return. + return f_values[:, model.classes_ == class_value].flatten() + + +def cdf(x_0, model, class_value): + ''' + Cumulative distribution function as described above. Integral is + approximated using Simpson's rule for efficiency. + + Arguments: + ---------- + + x_0 -- private features of an instance for which the value of cdf is to be + calculated. + model -- a trained sklearn model. Predicts probabilities for given x. + Should accept input of shape (observations, features) + class_value -- the resulting class to predict (usually 0 or 1). + + ''' + + def prediction(x): + return getProbabilityForClass( + np.array([x]).reshape(-1, 1), model, class_value) + + prediction_x_0 = prediction(x_0) + + x_values = np.linspace(-15, 15, 40000) + + x_preds = prediction(x_values) + + y_values = scs.norm.pdf(x_values) + + results = np.zeros(x_0.shape[0]) + + for i in range(x_0.shape[0]): + + y_copy = y_values.copy() + + y_copy[x_preds > prediction_x_0[i]] = 0 + + results[i] = si.simps(y_copy, x=x_values) + + return results + + +def bailIndicator(r, y_model, x_train, x_test): + ''' + Indicator function for whether a judge will bail or jail a suspect. + Rationale explained above. + + Algorithm: + ---------- + + (1) Calculate recidivism probabilities from training set with a trained + model and assign them to predictions_train. + + (2) Calculate recidivism probabilities from test set with the trained + model and assign them to predictions_test. + + (3) Construct a quantile function of the probabilities in + in predictions_train. + + (4) + For pred in predictions_test: + + if pred belongs to a percentile (computed from step (3)) lower than r + return True + else + return False + + Arguments: + ---------- + + r -- float, acceptance rate, between 0 and 1 + y_model -- a trained sklearn predictive model to predict the outcome + x_train -- private features of the training instances + x_test -- private features of the test instances + + Returns: + -------- + (1) Boolean list indicating a bail decision (bail = True) for each + instance in x_test. + ''' + + predictions_train = getProbabilityForClass(x_train, y_model, 0) + + predictions_test = getProbabilityForClass(x_test, y_model, 0) + + return [ + scs.percentileofscore(predictions_train, pred, kind='weak') < r + for pred in predictions_test + ] + + +# ### Contraction algorithm +# +# Below is an implementation of Lakkaraju's team's algorithm presented in +# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant +# parameters to be passed to the function are presented in the description. + +def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, + accRateR_col, r): + ''' + This is an implementation of the algorithm presented by Lakkaraju + et al. in their paper "The Selective Labels Problem: Evaluating + Algorithmic Predictions in the Presence of Unobservables" (2017). 
+ + Arguments: + ---------- + df -- The (Pandas) data frame containing the data, judge decisions, + judge IDs, results and probability scores. + judgeIDJ_col -- String, the name of the column containing the judges' IDs + in df. + decisionT_col -- String, the name of the column containing the judges' decisions + resultY_col -- String, the name of the column containing the realization + modelProbS_col -- String, the name of the column containing the probability + scores from the black-box model B. + accRateR_col -- String, the name of the column containing the judges' + acceptance rates + r -- Float between 0 and 1, the given acceptance rate. + + Returns: + -------- + (1) The estimated failure rate at acceptance rate r. + ''' + # Get ID of the most lenient judge. + most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()] + + # Subset. "D_q is the set of all observations judged by q." + D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() + + # All observations of R_q have observed outcome labels. + # "R_q is the set of observations in D_q with observed outcome labels." + R_q = D_q[D_q[decisionT_col] == 1].copy() + + # Sort observations in R_q in descending order of confidence scores S and + # assign to R_sort_q. + # "Observations deemed as high risk by B are at the top of this list" + R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False) + + number_to_remove = int( + round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0]))) + + # "R_B is the list of observations assigned to t = 1 by B" + R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] + + return np.sum(R_B[resultY_col] == 0) / D_q.shape[0] + + +# ### Evaluators + +def contractionEvaluator(df, featureX_col, judgeIDJ_col, decisionT_col, + resultY_col, accRateR_col, r): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + # Invoke the original contraction. 
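+ # The black-box model's predicted probabilities for the test half are passed in as the
+ # confidence scores S that contraction() sorts on.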
+ FR = contraction(test, + judgeIDJ_col=judgeIDJ_col, + decisionT_col=decisionT_col, + resultY_col=resultY_col, + modelProbS_col="B_prob_0_model", + accRateR_col=accRateR_col, + r=r) + + return FR + + +def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel(train[featureX_col], + train[resultY_col], + test[featureX_col], 0) + + test = test.assign(B_prob_0_model=predictions) + + test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + to_release = int(round(test.shape[0] * r)) + + return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + + +def labeledOutcomesEvaluator(df, + featureX_col, + decisionT_col, + resultY_col, + r, + adjusted=False): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + test_observed = test.loc[test[decisionT_col] == 1, :] + + test_observed = test_observed.sort_values(by='B_prob_0_model', + inplace=False, + ascending=True) + + to_release = int(round(test_observed.shape[0] * r)) + + if adjusted: + return np.mean(test_observed[resultY_col][0:to_release] == 0) + + return np.sum( + test_observed[resultY_col][0:to_release] == 0) / test.shape[0] + + +def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, + accRateR_col, r): + + # Get judges with correct leniency as list + is_correct_leniency = df[accRateR_col].round(1) == r + + # No judges with correct leniency + if np.sum(is_correct_leniency) == 0: + return np.nan + + correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] + + # Released are the people they judged and released, T = 1 + released = df[df[judgeIDJ_col].isin(correct_leniency_list) + & (df[decisionT_col] == 1)] + + # Get their failure rate, aka ratio of reoffenders to number of people judged in total + return np.sum(released[resultY_col] == 0) / correct_leniency_list.shape[0] + + +def causalEvaluator(df, featureX_col, decisionT_col, resultY_col, r): + + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + released = cdf(test[featureX_col], B_model, 0) < r + + return np.mean(test.B_prob_0_model * released) + + +def monteCarloEvaluator(df, + featureX_col, + decisionT_col, + resultY_col, + accRateR_col, + r, + mu_X=0, + mu_Z=0, + beta_X=1, + beta_Z=1, + sigma_X=1, + sigma_Z=1): + + # Train the models and assign the predicted probabilities. + train, test = train_test_split(df, test_size=0.5) + + B_model, predictions = fitPredictiveModel( + train.loc[train[decisionT_col] == 1, featureX_col], + train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], + 0) + + test = test.assign(B_prob_0_model=predictions) + + # Compute the predicted/assumed decision bounds for all the judges. + q_r = inverse_cumulative(x=test[accRateR_col], + mu=mu_X + mu_Z, + sigma=np.sqrt((beta_X * sigma_X)**2 + + (beta_Z * sigma_Z)**2)) + + test = test.assign(bounds=logit(q_r) - test[featureX_col]) + + # Compute the expectation of Z when it is known to come from truncated + # Gaussian. 
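+ # For Z ~ N(mu_Z, sigma_Z^2) truncated at the bound, with a = (bound - mu_Z) / sigma_Z:
+ # E[Z | Z > bound] = mu_Z + sigma_Z * phi(a) / (1 - Phi(a)) (negative decision, T = 0)
+ # E[Z | Z < bound] = mu_Z - sigma_Z * phi(a) / Phi(a) (positive decision, T = 1)
+ # where phi and Phi are the standard normal pdf and cdf.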
+ alphabeta = (test.bounds - mu_Z) / (sigma_Z) + + Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z) # 1 - cdf(ab) + + # E(Z | Z > a). Expectation of Z if negative decision. + exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_ + + # E(Z | Z < b). Expectation of Z if positive decision. + exp_upper_trunc = mu_Z - ( + sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta) + + exp_Z = (1 - test[decisionT_col] + ) * exp_lower_trunc + test[decisionT_col] * exp_upper_trunc + + # Attach the predicted probability for Y=0 to data. + test = test.assign(predicted_Y=inv_logit(test[featureX_col] + exp_Z)) + + # Predictions drawn from binomial. (Should fix this.) + predictions = npr.binomial(n=1, p=1 - test.predicted_Y, size=test.shape[0]) + + test[resultY_col] = np.where(test[decisionT_col] == 0, predictions, + test[resultY_col]) + + test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + to_release = int(round(test.shape[0] * r)) + + return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + + +# ## Performance comparison +# +# Below we try to replicate the results obtained by Lakkaraju and compare +# their model's performance to the one of ours. + +def drawDiagnostics(title, save_name, save, f_rates, titles): + + cols = 2 + rows = np.ceil(len(f_rates) / cols) + + plt.figure(figsize=(16, 4.5*rows+1)) + + ax = plt.subplot(rows, cols, 1) + x_ax = np.arange(1, 9, 1) / 10 + + plt.boxplot(f_rates[0], labels=x_ax) + + plt.title(titles[0]) + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.grid() + + for i in range(len(f_rates)): + plt.subplot(rows, cols, i + 1, sharey=ax) + + plt.boxplot(f_rates[i], labels=x_ax) + + plt.title(titles[i]) + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.grid() + + plt.tight_layout() + plt.subplots_adjust(top=0.89) + plt.suptitle(title, y=0.96, weight='bold') + + if save: + plt.savefig(save_name + '_diagnostic_plot') + + plt.show() + + +def perfComp(dgModule, deciderModule, title, save_name, save=True, nIter=50): + failure_rates = np.zeros((8, 7)) + failure_sems = np.zeros((8, 7)) + + f_rate_true = np.zeros((nIter, 8)) + f_rate_label = np.zeros((nIter, 8)) + f_rate_label_adj = np.zeros((nIter, 8)) + f_rate_human = np.zeros((nIter, 8)) + f_rate_cont = np.zeros((nIter, 8)) + f_rate_caus = np.zeros((nIter, 8)) + + # Create data + df = dgModule() + + # Decicions + df_labeled, df_unlabeled = deciderModule(df) + + # Split data + train, test = train_test_split(df_labeled, test_size=0.5) + + # Train model + B_model, predictions = fitPredictiveModel( + train.loc[train['decision_T'] == 1, 'X'], + train.loc[train['decision_T'] == 1, 'result_Y'], test['X'], 0) + + test = test.assign(B_prob_0_model=predictions) + + test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + kk_array = pd.qcut(test['B_prob_0_model'], group_amount, labels=False) + + # Find observed values + observed = test['decision_T'] == 1 + + # Assign data to the model + dat = dict(D=1, + N_obs=np.sum(observed), + N_cens=np.sum(~observed), + K=group_amount, + sigma_tau=sigma_tau, + M=len(set(df_unlabeled['judgeID_J'])), + jj_obs=test.loc[observed, 'judgeID_J']+1, + jj_cens=test.loc[~observed, 'judgeID_J']+1, + kk_obs=kk_array[observed]+1, + kk_cens=kk_array[~observed]+1, + dec_obs=test.loc[observed, 'decision_T'], + dec_cens=test.loc[~observed, 'decision_T'], + X_obs=test.loc[observed, 'B_prob_0_model'].values.reshape(-1,1), + X_cens=test.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1), + y_obs=test.loc[observed, 
'result_Y'].astype(int)) + + fit = sm.sampling(data=dat, chains=5, iter=4000, control = dict(adapt_delta=0.9)) + + pars = fit.extract() + + plt.figure(figsize=(15,30)) + + fit.plot(); + + if save: + plt.savefig(save_name + '_stan_diagnostic_plot') + + plt.show() + plt.close('all') + + if save: + print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w')) + + # Bayes + + # Alusta matriisi, rivillä yksi otos posteriorista + # sarakkeet havaintoja + y_imp = np.ones((pars['y_est'].shape[0], test.shape[0])) + + # Täydennetään havaitsemattomat estimoiduilla + y_imp[:, ~observed] = 1-pars['y_est'] + + # Täydennetään havaitut havaituilla + y_imp[:, observed] = 1-test.loc[observed, 'result_Y'] + + Rs = np.arange(.1, .9, .1) + + to_release_list = np.round(test.shape[0] * Rs).astype(int) + + f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan) + + for i in range(len(to_release_list)): + est_failure_rates = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) / test.shape[0] + + f_rate_bayes[:, i] = est_failure_rates + + failure_rates[i, 6] = np.mean(est_failure_rates) + + for i in range(nIter): + + print(" [", i, "] ", sep='', end="") + + for r in np.arange(1, 9): + + print(".", end="") + + # True evaluation + + f_rate_true[i, r - 1] = trueEvaluationEvaluator( + df_unlabeled, 'X', 'decision_T', 'result_Y', r / 10) + + # Labeled outcomes only + + f_rate_label[i, r - 1] = labeledOutcomesEvaluator( + df_labeled, 'X', 'decision_T', 'result_Y', r / 10) + + # Adjusted labeled outcomes + + f_rate_label_adj[i, r - 1] = labeledOutcomesEvaluator( + df_labeled, + 'X', + 'decision_T', + 'result_Y', + r / 10, + adjusted=True) + + # Human evaluation + + f_rate_human[i, r - 1] = humanEvaluationEvaluator( + df_labeled, 'judgeID_J', 'decision_T', 'result_Y', + 'acceptanceRate_R', r / 10) + + # Contraction + + f_rate_cont[i, r - 1] = contractionEvaluator( + df_labeled, 'X', 'judgeID_J', 'decision_T', 'result_Y', + 'acceptanceRate_R', r / 10) + + # Causal model - analytic solution + + f_rate_caus[i, r - 1] = monteCarloEvaluator( + df_labeled, 'X', 'decision_T', 'result_Y', 'acceptanceRate_R', + r / 10) + + failure_rates[:, 0] = np.mean(f_rate_true, axis=0) + failure_rates[:, 1] = np.mean(f_rate_label, axis=0) + failure_rates[:, 2] = np.mean(f_rate_label_adj, axis=0) + failure_rates[:, 3] = np.mean(f_rate_human, axis=0) + failure_rates[:, 4] = np.mean(f_rate_cont, axis=0) + failure_rates[:, 5] = np.mean(f_rate_caus, axis=0) + #failure_rates[:, 6] = f_rate_bayes + + failure_sems[:, 0] = scs.sem(f_rate_true, axis=0) + failure_sems[:, 1] = scs.sem(f_rate_label, axis=0) + failure_sems[:, 2] = scs.sem(f_rate_label_adj, axis=0) + failure_sems[:, 3] = scs.sem(f_rate_human, axis=0) + failure_sems[:, 4] = scs.sem(f_rate_cont, axis=0) + failure_sems[:, 5] = scs.sem(f_rate_caus, axis=0) + failure_sems[:, 6] = scs.sem(f_rate_bayes, axis=0, nan_policy='omit') + + x_ax = np.arange(0.1, 0.9, 0.1) + + labels = [ + 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', + 'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes' + ] + colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c'] + + for i in range(failure_rates.shape[1]): + plt.errorbar(x_ax, + failure_rates[:, i], + label=labels[i], + c=colours[i], + yerr=failure_sems[:, i]) + + plt.title('Failure rate vs. 
Acceptance rate') + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.legend() + plt.grid() + + if save: + plt.savefig(save_name + '_all') + + plt.show() + + print("\nFailure rates:") + print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) + + print("\nMean absolute errors:") + for i in range(1, failure_rates.shape[1]): + print( + labels[i].ljust(len(max(labels, key=len))), + np.round( + np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) + + drawDiagnostics(title=title, + save_name=save_name, + save=save, + f_rates=[ + f_rate_true, f_rate_label, f_rate_label_adj, + f_rate_human, f_rate_cont, f_rate_caus, f_rate_bayes + ], + titles=labels) + + +sm = pystan.StanModel(file=stan_code_file_name) + +if which == 1: + print("Without unobservables (Bernoulli + independent decisions)") + + dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim) + + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z" + ) + +gc.collect() +plt.close('all') + +print("With unobservables in the data") + +if which == 2: + print("\nBernoulli + independent decisions") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations \n" + + "Bernoulli + independent decisions, with unobservables", + figure_path + "sl_bernoulli_independent_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 3: + print("\nThreshold rule + independent decisions") + + dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) + + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations \n" + + "Threshold rule + independent decisions, with unobservables", + figure_path + "sl_threshold_independent_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 4: + print("\nBernoulli + non-independent (batch) decisions") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: humanDeciderLakkaraju( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations \n" + + "Bernoulli + non-independent decisions, with unobservables", + figure_path + "sl_bernoulli_batch_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 5: + print("\nThreshold rule + non-independent (batch) decisions") + + dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) + + decider = lambda x: humanDeciderLakkaraju( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations \n" + + "Threshold rule + non-independent decisions, with unobservables", + figure_path + "sl_threshold_batch_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 6: + print("\nRandom decider") + + dg = lambda: 
bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: randomDecider( + x, nJudges_M=M_sim, use_acceptance_rates=True) + + perfComp( + dg, lambda x: decider(x), + "Bernoulli + random decider with leniency and unobservables", + figure_path + "sl_random_decider_with_Z", + ) + +gc.collect() +plt.close('all') + +if which == 7: + print("\nBiased decider") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Bernoulli + biased decider with leniency and unobservables", + figure_path + "sl_biased_decider_with_Z", + ) + + +if which == 8: + print("\nBad judge") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim) + + perfComp( + dg, lambda x: decider(x), + "Bernoulli + 'bad' decider with leniency and unobservables", + figure_path + "sl_bad_decider_with_Z" + ) + +gc.collect() +plt.close('all') + +if which == 9: + print("\nBernoulli + Bernoulli") + + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim) + + perfComp( + dg, lambda x: decider(x), + "Bernoulli + Bernoulli", + figure_path + "sl_bernoulli_bernoulli_with_Z", + ) + +if which == 10: + print("\nBeta_Z = 3, Threshold + batch") + + dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0) + + decider = lambda x: humanDeciderLakkaraju( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Beta_Z = 3, threshold + batch", + figure_path + "sl_threshold_batch_beta_Z_3_with_Z", + ) + +if which == 11: + print("\nBeta_Z = 5, Threshold + batch") + + dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0) + + decider = lambda x: humanDeciderLakkaraju( + x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True) + + perfComp( + dg, lambda x: decider(x), + "Beta_Z = 5, threshold + batch", + figure_path + "sl_threshold_batch_beta_Z_5_with_Z", + ) \ No newline at end of file
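Usage: both driver scripts take their settings as positional command-line arguments in the
order listed in their header comments. A hypothetical invocation (argument values chosen only
for illustration) could look like

    python stan_modelling_theoretic.py figures/ 12000 12 4 10 code_jung.stan 1.0
    python stan_modelling_empirical.py figures/ 10 10 code_jung.stan 1.0 <path to COMPAS csv>

where N_sim (12000) must be divisible by M_sim (12).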