From 549f74302420e3042a9cd82261dad6db6ed74c85 Mon Sep 17 00:00:00 2001
From: Riku-Laine <28960190+Riku-Laine@users.noreply.github.com>
Date: Mon, 20 Jan 2020 11:53:19 +0200
Subject: [PATCH] Updated stan script

---
 .../stan_modelling_empirical.py | 297 ++++++++++--------
 1 file changed, 167 insertions(+), 130 deletions(-)

diff --git a/analysis_and_scripts/stan_modelling_empirical.py b/analysis_and_scripts/stan_modelling_empirical.py
index 5423a56..2b62c3c 100644
--- a/analysis_and_scripts/stan_modelling_empirical.py
+++ b/analysis_and_scripts/stan_modelling_empirical.py
@@ -1,19 +1,19 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-# Author: Riku Laine
-# Date: 12AUG2019
-# Project name: Potential outcomes in model evaluation
-# Description: This script creates the figures and results used
-#              in empirical data experiments with COMPAS data.
-#
-# Parameters:
-# -----------
-# (1) save_name : file path and prefix for saving the created figures.
-# (2) group_amount : How many groups if the non-linear model is used.
-# (3) stan_code_file_name : Name of file containing the stan model code.
-# (4) sigma_tau : Values of prior variance for the Jung-inspired model.
-# (5) data_path : Path to data file.
+Author: Riku Laine
+Date: 12AUG2019 / Modified 30DEC2019
+Project name: Potential outcomes in model evaluation
+Description: This script creates the figures and results used
+             in empirical data experiments with COMPAS data.
+
+Parameters:
+-----------
+(1) save_name : file path and prefix for saving the created figures.
+(2) stan_code_file_name : Name of file containing the stan model code.
+(3) sigma_tau : Values of prior variance for the Jung-inspired model.
+(4) data_path : Path to data file.
+(5) nJudges_M : Number of judges in simulations.
 """

 import numpy as np
@@ -35,31 +35,33 @@ plt.rcParams.update({'figure.figsize': (10, 6)})
 import sys

 # Seed for reproducibility
-npr.seed(123)
+npr.seed(111)

 # figure storage name
-save_name = sys.argv[1] + "sl_compas"
-
-# How many groups if jung model is used
-group_amount = int(sys.argv[2])
+save_name = sys.argv[1] + "sl_compas_"

 # Name of stan model code file
-stan_code_file_name = sys.argv[3]
+stan_code_file_name = sys.argv[2]

 # Variance prior
-sigma_tau = float(sys.argv[4])
+sigma_tau = float(sys.argv[3])

 # Path to data file
-data_path = sys.argv[5]
+data_path = sys.argv[4]
+
+nJudges_M = int(sys.argv[5])

 # Prefix for the figures and log files

 print("These results have been obtained with the following settings:")

-print("Number of groups:", group_amount)
-
 print("Prior for the variances:", sigma_tau)

+def standardize(x):
+    '''Standardization function'''
+
+    return (x - np.mean(x)) / np.std(x, ddof=1)
+

 ##########################

 # ## Evaluator modules

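With the positional arguments renumbered as above, the script now expects five arguments in the order given in the header docstring. A hypothetical invocation could look as follows; the directory and file names are examples only, while the argument order and types come from the script:

    python stan_modelling_empirical.py results/ code/model_code.stan 1 data/compas-scores-two-years.csv 24

Because the leniencies are later assigned with np.repeat([0.1, 0.5, 0.9], int(nJudges_M/3)), a judge count divisible by three (such as 24) keeps that assignment well defined.
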
@@ -68,19 +70,19 @@ print("Prior for the variances:", sigma_tau)

 def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
     '''
-    Fit a predictive model (default logistic regression) with given training 
-    instances and return probabilities for test instances to obtain a given 
+    Fit a predictive model (default logistic regression) with given training
+    instances and return probabilities for test instances to obtain a given
     class label.
-    
+
     Arguments:
     ----------
-    
+
     x_train -- x values of training instances
     y_train -- y values of training instances
     x_test -- x values of test instances
     class_value -- class label for which the probabilities are computed.
     model_type -- type of model to be fitted.
-    
+
     Returns:
     --------
     (1) Trained predictive model
@@ -130,13 +132,13 @@ def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):

 def getProbabilityForClass(x, model, class_value):
     '''
-    Function (wrapper) for obtaining the probability of a class given x and a 
+    Function (wrapper) for obtaining the probability of a class given x and a
     predictive model.

     Arguments:
     -----------
     x -- individual features, an array of shape (observations, features)
-    model -- a trained sklearn model. Predicts probabilities for given x. 
+    model -- a trained sklearn model. Predicts probabilities for given x.
         Should accept input of shape (observations, features)
     class_value -- the resulting class to predict (usually 0 or 1).

@@ -155,8 +157,8 @@ def getProbabilityForClass(x, model, class_value):

 # ### Contraction algorithm
-# 
-# Below is an implementation of Lakkaraju's team's algorithm presented in 
+#
+# Below is an implementation of Lakkaraju's team's algorithm presented in
 # [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant
 # parameters to be passed to the function are presented in the description.

@@ -164,7 +166,7 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
                 accRateR_col, r):
     '''
     This is an implementation of the algorithm presented by Lakkaraju
-    et al. in their paper "The Selective Labels Problem: Evaluating 
+    et al. in their paper "The Selective Labels Problem: Evaluating
     Algorithmic Predictions in the Presence of Unobservables" (2017).

     Arguments:
@@ -177,7 +179,7 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
     resultY_col -- String, the name of the column containing the realization
     modelProbS_col -- String, the name of the column containing the probability
         scores from the black-box model B.
-    accRateR_col -- String, the name of the column containing the judges' 
+    accRateR_col -- String, the name of the column containing the judges'
         acceptance rates
     r -- Float between 0 and 1, the given acceptance rate.

@@ -192,25 +194,28 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
     D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()

     ##### Agreement rate computation begins #######
-
-    max_leniency = max(df[accRateR_col])
-
+
+    Q_leniency = max(df[accRateR_col])
+
+    if Q_leniency < r:
+        return np.nan, np.nan
+
     # Sort D_q
     # "Observations deemed as high risk by B are at the top of this list"
     D_sort_q = D_q.sort_values(by=modelProbS_col, ascending=False)
-
+
-    # The agreement rate is now computed as the percentage of all the positive
-    # decisions given by q in the 1-r % of the most dangerous subjects.
+    # The agreement rate is now computed as the share of q's negative decisions
+    # that fall within the (1 - Q_leniency) share of the most dangerous subjects.
     all_negatives = np.sum(D_sort_q[decisionT_col] == 0)
-
-    n_most_dangerous = int(round(D_q.shape[0] * (1-max_leniency)))
-
-    agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous])
+
+    n_most_dangerous = int(round(D_q.shape[0] * (1-Q_leniency)))
+
+    agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous] == 0)

     print("The agreement rate of contraction is:", agreed_decisions/all_negatives)
-
+
     ##### Agreement rate computation ends #######
-
+
     # All observations of R_q have observed outcome labels.
     # "R_q is the set of observations in D_q with observed outcome labels."
     R_q = D_q[D_q[decisionT_col] == 1].copy()
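
As a quick illustration of the inputs the contraction estimator expects, a minimal sketch on a toy data frame might look like the following. The toy column names and values are invented; the call signature and the two return values (the failure-rate estimate and the size of D_q) are taken from the docstring above and from how the function is called later in this script, and the part of contraction() not shown in these hunks is assumed unchanged from the repository version.

    import numpy as np
    import pandas as pd

    # Toy decision data for two judges (hypothetical values).
    toy = pd.DataFrame({
        'judge_ID':   [0, 0, 0, 0, 1, 1, 1, 1],
        'acc_rate_R': [0.5, 0.5, 0.5, 0.5, 0.75, 0.75, 0.75, 0.75],
        'decision_T': [1, 0, 1, 0, 1, 1, 1, 0],
        # result_Y is only observed when decision_T == 1 (NaN otherwise).
        'result_Y':   [1, np.nan, 0, np.nan, 1, 0, 1, np.nan],
        # Black-box probability of the negative outcome (higher = riskier).
        'B_prob_0':   [0.2, 0.9, 0.4, 0.8, 0.1, 0.6, 0.3, 0.7]
    })

    # Estimated failure rate at acceptance rate r = 0.5, and the size of D_q.
    fr_hat, n_q = contraction(toy, 'judge_ID', 'decision_T', 'result_Y',
                              'B_prob_0', 'acc_rate_R', r=0.5)
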
@@ -236,7 +241,7 @@ def trueEvaluationEvaluator(df, resultY_col, r):
     df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)

     to_release = int(round(df.shape[0] * r))
-    
+
     failed = df[resultY_col][0:to_release] == 0

     return np.sum(failed) / df.shape[0], df.shape[0]
@@ -251,7 +256,7 @@ def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
                              ascending=True)

     to_release = int(round(df_observed.shape[0] * r))
-    
+
     failed = df_observed[resultY_col][0:to_release] == 0

     if adjusted:
@@ -266,13 +271,12 @@ def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False):
 compas_raw = pd.read_csv(data_path)

 # Select columns
-compas = compas_raw[[
-    'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
-    'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
-    'two_year_recid', 'c_jail_in', 'c_jail_out'
-]]
+compas = compas_raw[['age', 'c_charge_degree', 'race', 'age_cat', 'score_text',
+                     'sex','priors_count', 'days_b_screening_arrest',
+                     'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in',
+                     'c_jail_out']]

-# Subset values, see reasons in ProPublica methodology.
+# Filter values, see ProPublica methodology.
 compas = compas.query('days_b_screening_arrest <= 30 and \
                       days_b_screening_arrest >= -30 and \
                       is_recid != -1 and \
@@ -286,30 +290,39 @@ compas = compas[compas.score_text.notnull()]
 compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1)

 # Convert string values to dummies, drop first to keep full rank.
-compas_dummy = pd.get_dummies(
-    compas,
-    columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
-    drop_first=True)
+compas_dummy = pd.get_dummies(compas,
+                              columns=['c_charge_degree', 'race', 'age_cat', 'sex'],
+                              drop_first=True)

-compas_dummy.drop(columns=[
-    'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
-    'two_year_recid', 'score_text', 'is_recid'
-],
-                  inplace=True)
+compas_dummy.drop(columns=['age', 'days_b_screening_arrest', 'c_jail_in',
+                           'c_jail_out', 'two_year_recid', 'score_text',
+                           'is_recid'], inplace=True)

 # Shuffle rows for random judge assignment
 compas_shuffled = compas_dummy.sample(frac=1)

-nJudges_M = 10
+# Decision assignment
+
+# Assign leniencies 0.1, 0.5 and 0.9 evenly to the nJudges_M judges (a random
+# U(0.1, 0.9) draw is kept below as a commented-out alternative). Then
+# distribute the observations evenly to all decision-makers and give positive
+# decisions to the r % of each judge's subjects with the lowest COMPAS score.
+# (An illustrative sketch of that decision step follows this hunk.)
+
+# nJudges_M = 16
+# nJudges_M = 12

 # Assign judges as evenly as possible
 judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False)

-# Assign fixed leniencies from 0.1 to 0.9
-judge_leniency = np.array([0.1, 0.3, 0.5, 0.7, 0.9]*2)
+# Draw leniencies
+# judge_leniency = npr.uniform(0.1, 0.9, nJudges_M)
+# Assign leniencies
+judge_leniency = np.repeat([0.1, 0.5, 0.9], int(nJudges_M/3))
+
+# Replicate values for assignment
 judge_leniency = judge_leniency[judge_ID]

+# Attach to data
 compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID,
                                          judge_leniency=judge_leniency)
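
The hunks above only set up judge IDs and leniencies; the decision step described in the comment (release the leniency share of each judge's subjects with the lowest COMPAS score) sits in an unchanged part of the file and is not visible in this patch. A minimal sketch consistent with that description, reusing this script's column names (judge_ID, judge_leniency, decile_score, decision_T), could look like the following; it is an illustration, not the committed code.

    # Illustrative only: release (decision_T = 1) the judge_leniency share of
    # each judge's caseload with the lowest decile_score, detain the rest.
    compas_shuffled['decision_T'] = 0

    for judge, idx in compas_shuffled.groupby('judge_ID').groups.items():
        cases = compas_shuffled.loc[idx].sort_values(by='decile_score')
        n_release = int(round(len(cases) * cases['judge_leniency'].iloc[0]))
        compas_shuffled.loc[cases.index[:n_release], 'decision_T'] = 1
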
@@ -340,20 +353,27 @@ compas_unlabeled = compas_shuffled.copy()

 # Hide unobserved
 compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan

-# Choose feature_columns
+# Choose feature_columns for the regression (priors, charge degree, race, age
+# and sex).
 feature_cols = ~compas_labeled.columns.isin(
     ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T'])

 feature_cols = compas_labeled.columns[feature_cols]

+# Standardize priors count
+
+compas_labeled.priors_count = standardize(compas_labeled.priors_count)

 ####### Draw figures ###########

 def drawDiagnostics(title, save_name, f_rates, titles):
-    
+    '''
+    Draw boxplot figures. One panel for each method.
+    '''
+
     cols = 2
     rows = np.ceil(len(f_rates) / cols)
-    
+
     plt.figure(figsize=(16, 4.5*rows+1))

     ax = plt.subplot(rows, cols, 1)
@@ -383,6 +403,8 @@ def drawDiagnostics(title, save_name, f_rates, titles):
     plt.savefig(save_name + '_diagnostic_plot')

     plt.show()
+    plt.close('all')
+

 nIter = 10
@@ -398,87 +420,89 @@ f_rate_cf = np.zeros((nIter, 8))
 sm = Model(stan_file=stan_code_file_name)
 sm.compile()

+# Split the nJudges_M judges into two data sets, nIter times.
+
 for i in range(nIter):
-    
-    ids = np.arange(5)
-    selected = ids + npr.choice([0, 5], 5)
-    
+
+    # Split data using judge IDs.
+
+    # Get number of judges
+    id_max = np.max(compas_labeled.judge_ID) + 1
+
+    # Choose random judges by id
+    selected = npr.choice(np.arange(id_max), int(id_max/2), False)
+
     test_labeled = compas_labeled[compas_labeled.judge_ID.isin(selected)]
     train = compas_labeled[~compas_labeled.judge_ID.isin(selected)]

     # Renew IDs
-    test_labeled.loc[:, 'judge_ID'] = test_labeled.judge_ID % 5
-    
+    test_labeled.loc[:, 'judge_ID'] = test_labeled.groupby('judge_ID').ngroup()
+
     # Assign same observations to unlabeled data
     test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values]
-    
-    
+
     # Train a logistic regression model
     B_model, predictions = fitPredictiveModel(
         train.loc[train['decision_T'] == 1, feature_cols],
         train.loc[train['decision_T'] == 1, 'result_Y'],
         test_labeled[feature_cols], 0)
-    
+
     # Attach predictions to data
     test_labeled = test_labeled.assign(B_prob_0_model=predictions)
     test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
-    
+
     test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-    
-    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
-    
+
     # Find observed values
     observed = test_labeled['decision_T'] == 1
-    
+
     # Assign data to the model
     dat = dict(D=int(10),
                N_obs=int(np.sum(observed)),
                N_cens=int(np.sum(~observed)),
-               K=int(group_amount),
                sigma_tau=sigma_tau,
               M=int(len(set(test_labeled['judge_ID']))),
               jj_obs=test_labeled.loc[observed, 'judge_ID']+1,
               jj_cens=test_labeled.loc[~observed, 'judge_ID']+1,
-               kk_obs=kk_array[observed]+1,
-               kk_cens=kk_array[~observed]+1,
               dec_obs=test_labeled.loc[observed, 'decision_T'],
               dec_cens=test_labeled.loc[~observed, 'decision_T'],
-               X_obs=test_labeled.loc[observed, feature_cols.values],
-               X_cens=test_labeled.loc[~observed, feature_cols.values],
+               X_obs=test_labeled.loc[observed, feature_cols.values].values,
+               X_cens=test_labeled.loc[~observed, feature_cols.values].values,
               y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
-    
-    # Do the sampling
-    fit = sm.sample(data=dat, chains=4, seed=123, warmup_iters=1500,
-                    sampling_iters=4500, adapt_delta=0.975)
+
+
+    # Do the sampling
+    fit = sm.sample(data=dat, chains=4, seed=123, warmup_iters=1500,
+                    sampling_iters=4000, adapt_delta=0.975)

     # Extract predictions
     fit_colnames = fit.column_names

-    print("Colnames", fit.column_names)
-
     pred_cols = [s for s in fit_colnames if 'y_est' in s]
-    
+
     pars = fit.get_drawset(params=pred_cols)
-    
+
     fit.get_drawset(params=pred_cols)
-    print("Drawset:", fit.get_drawset())
-
     plt.close('all')
     gc.collect()
     gc.collect()
     print(fit.diagnose(), file=open(save_name + '_stan_fit_diagnostics_' + str(i) + '.txt', 'w'))
-    
+
+    # Save the fit object for every 5th iteration
+    if i % 5 == 0:
+        # Save fit object to csv
+        fit.save_csvfiles(dir=save_name, basename='fit_obj_i_' + str(i))

     # Bayes
-    
+
     # Format matrix, each row is a sample of the posterior
     # columns are observations
-    
+
     #y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
     y_imp = np.ones((pars.shape[0], test_labeled.shape[0]))

-    # Impute the unobserved with the estimated 
+    # Impute the unobserved with the estimated
     # Reverse the binary coding so that failures can be counted as a sum (outcome 0 becomes 1).
     #y_imp[:, ~observed] = 1 - pars['y_est']
     y_imp[:, ~observed] = 1 - pars
@@ -491,53 +515,53 @@ for i in range(nIter):
     gc.collect()
     gc.collect()
-    
+
     process = psutil.Process(os.getpid())
-    print("Memory use after deletion:", process.memory_info().rss)  # in bytes
-    
+    print("Memory use after deletion:", process.memory_info().rss)  # in bytes
+
     # Impute the observed
     y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
-    
+
     for r in range(1, 9):
-        
+
         to_release = np.round(test_labeled.shape[0] * (r / 10)).astype(int)
-        
+
         failed = np.sum(y_imp[:, 0:to_release], axis=1)
-        
+
         f_rate_cf[i, r - 1] = np.mean(failed / test_labeled.shape[0])
-        
+
         print(".", end="")
-        
+
         # True evaluation
-        
-        f_rate_true[i, r - 1], N = trueEvaluationEvaluator(test_unlabeled, 
+
+        f_rate_true[i, r - 1], N = trueEvaluationEvaluator(test_unlabeled,
                                                            'result_Y', r / 10)
-        
+
         # Labeled outcomes only
-        
-        f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled, 
+
+        f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled,
                                                              'decision_T', 'result_Y', r / 10)
-        
+
         # Contraction
-        
-        f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judge_ID', 
-                                               'decision_T', 'result_Y', 'B_prob_0_model', 
+
+        f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judge_ID',
+                                               'decision_T', 'result_Y', 'B_prob_0_model',
                                                'judge_leniency', r / 10)
-
-failure_rates[:, 0] = np.mean(f_rate_true, axis=0)
-failure_rates[:, 1] = np.mean(f_rate_label, axis=0)
-failure_rates[:, 2] = np.mean(f_rate_cont, axis=0)
-failure_rates[:, 3] = np.mean(f_rate_cf, axis=0)
-failure_stds[:, 0] = np.std(f_rate_true, axis=0)
-failure_stds[:, 1] = np.std(f_rate_label, axis=0)
-failure_stds[:, 2] = np.std(f_rate_cont, axis=0)
-failure_stds[:, 3] = np.std(f_rate_cf, axis=0)
+failure_rates[:, 0] = np.nanmean(f_rate_true, axis=0)
+failure_rates[:, 1] = np.nanmean(f_rate_label, axis=0)
+failure_rates[:, 2] = np.nanmean(f_rate_cont, axis=0)
+failure_rates[:, 3] = np.nanmean(f_rate_cf, axis=0)
+
+failure_stds[:, 0] = np.nanstd(f_rate_true, axis=0, ddof=1)
+failure_stds[:, 1] = np.nanstd(f_rate_label, axis=0, ddof=1)
+failure_stds[:, 2] = np.nanstd(f_rate_cont, axis=0, ddof=1)
+failure_stds[:, 3] = np.nanstd(f_rate_cf, axis=0, ddof=1)

 x_ax = np.arange(0.1, 0.9, 0.1)

-labels = ['True Evaluation', 'Labeled outcomes', 'Contraction', 
+labels = ['True Evaluation', 'Labeled outcomes', 'Contraction',
           'Counterfactuals']

 colours = ['g', 'magenta', 'b', 'r']
@@ -562,12 +586,14 @@ plt.savefig(save_name + '_all')

 plt.show()

+plt.close('all')
+
 # Plot error of the methods (contraction and counterfactual)
 # w.r.t True evaluation

 for i in range(2, failure_rates.shape[1]):
     error = np.abs(failure_rates[:, 0] - failure_rates[:, i])
-    
+
     plt.errorbar(x_ax,
                  error,
                  label=labels[i],
@@ -575,6 +601,7 @@ for i in range(2, failure_rates.shape[1]):
                  linestyle=line_styles[i],
                  yerr=failure_stds[:, i])

+plt.axhline(y=0, c='k')
 plt.xlabel('Acceptance rate')
 plt.ylabel('Error w.r.t. True evaluation')
 plt.legend()
@@ -584,12 +611,22 @@ plt.savefig(save_name + '_all_err')

 plt.show()

+plt.close('all')
+
+print("\nSingle runs:")
+print(f_rate_true, f_rate_label, f_rate_cont, f_rate_cf)
+
 print("\nFailure rates:")
 print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))

 print("\nStandard deviations:")
 print(np.array2string(failure_stds, formatter={'float_kind':lambda x: "%.5f" % x}))

+# Save arrays
+np.save(save_name + "_true_FRs", f_rate_true)
+np.save(save_name + "_labeled_FRs", f_rate_label)
+np.save(save_name + "_contraction_FRs", f_rate_cont)
+np.save(save_name + "_counterfactuals_FRs", f_rate_cf)

 print("\nMean absolute errors:")
 for i in range(1, failure_rates.shape[1]):
@@ -601,4 +638,4 @@ for i in range(1, failure_rates.shape[1]):
 drawDiagnostics(title="Variation of failure rate estimates by method\nCOMPAS data set",
                 save_name=save_name,
                 f_rates=[f_rate_true, f_rate_label, f_rate_cont, f_rate_cf],
-                titles=labels)
\ No newline at end of file
+                titles=labels)
-- 
GitLab
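
The new np.save calls persist the per-iteration failure-rate matrices, so results can be reloaded later without re-running the sampler. A minimal sketch of such reuse; the save_name prefix below is a made-up example of what sys.argv[1] + "sl_compas_" might produce, and the summary shown is only roughly analogous to the script's final printout:

    import numpy as np

    save_name = "results/sl_compas_"  # hypothetical prefix (sys.argv[1] + "sl_compas_")

    # Each array has shape (nIter, 8): one row per data split,
    # one column per acceptance rate 0.1, ..., 0.8.
    f_rate_true = np.load(save_name + "_true_FRs.npy")
    f_rate_cont = np.load(save_name + "_contraction_FRs.npy")

    # Mean absolute error of contraction w.r.t. true evaluation,
    # computed from the iteration means.
    mae = np.mean(np.abs(np.nanmean(f_rate_true, axis=0) -
                         np.nanmean(f_rate_cont, axis=0)))
    print(mae)
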