diff --git a/analysis_and_scripts/README.md b/analysis_and_scripts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9624a6b01691b731a375552e467b9b21173562a0 --- /dev/null +++ b/analysis_and_scripts/README.md @@ -0,0 +1,12 @@ +# Files in this folder + +The following files are in this folder: +* The Jupyter notebooks are the old analysis files. +* notes.tex contains much of the research done during the summer. +* Stan files (Stan is a language for Bayesian modelling): + * code_linear_dependency - Code for the model with a linear connection between the explanatory and dependent variables. + * code_nonlinear_dependency - Code for the model with groups. See Jung's arXiv preprint for more info. +* The stan_modelling_* Python files create the figures: the empirical script runs the analysis on the COMPAS data and the theoretic script on the synthetic data. See the parameters in the docstrings. +* truth_tables.py is an old script from the original repository. + +Models can be run on the cluster by submitting the batch file (suffix .job) with the sbatch command; the batch file takes the analysis number (the which parameter of stan_modelling_theoretic.py) as its first argument. \ No newline at end of file diff --git a/analysis_and_scripts/code_full.stan b/analysis_and_scripts/code_full.stan deleted file mode 100644 index 5d484a345dc7c5d93fa2a7109c96efbc8b9da545..0000000000000000000000000000000000000000 --- a/analysis_and_scripts/code_full.stan +++ /dev/null @@ -1,73 +0,0 @@ -data { - int<lower=1> D; - int<lower=0> N_obs; - int<lower=0> N_cens; - int<lower=0> M; - real<lower=0> sigma_tau; - int<lower=1, upper=M> jj_obs[N_obs]; // judge_ID - int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID - int<lower=0, upper=1> dec_obs[N_obs]; - int<lower=0, upper=1> dec_cens[N_cens]; - row_vector[D] X_obs[N_obs]; - row_vector[D] X_cens[N_cens]; - int<lower=0, upper=1> y_obs[N_obs]; -} - -parameters { - vector[N_obs] Z_obs; - vector[N_cens] Z_cens; - real alpha_T[M]; - vector[D] beta_XT_raw; - vector[D] beta_XY_raw; - real<lower=0> beta_ZT_raw; // jungimainen oletus latentin pos. vaik. - real<lower=0> beta_ZY_raw; // jungimainen oletus latentin pos. vaik. - - real<lower=0> tau_XT; - real<lower=0> tau_XY; - real<lower=0> tau_ZT; - real<lower=0> tau_ZY; - -} - -transformed parameters { - vector[D] beta_XT; - vector[D] beta_XY; - real<lower=0> beta_ZT; // jungimainen oletus latentin pos. vaik. - real<lower=0> beta_ZY; // jungimainen oletus latentin pos. vaik.
- - beta_XT = tau_XT * beta_XT_raw; - beta_XY = tau_XY * beta_XY_raw; - beta_ZT = tau_ZT * beta_ZT_raw; - beta_ZY = tau_ZY * beta_ZY_raw; -} - -model { - Z_obs ~ normal(0, 1); - Z_cens ~ normal(0, 1); - - tau_XT ~ normal(0, sigma_tau); - tau_XY ~ normal(0, sigma_tau); - tau_ZT ~ normal(0, sigma_tau); - tau_ZY ~ normal(0, sigma_tau); - - beta_XT_raw ~ normal(0, 1); - beta_XY_raw ~ normal(0, 1); - beta_ZT_raw ~ normal(0, 1); - beta_ZY_raw ~ normal(0, 1); - - for(i in 1:N_obs){ - dec_obs[i] ~ bernoulli_logit(alpha_T[jj_obs[i]] + X_obs[i] * beta_XT + beta_ZT * Z_obs[i]); - y_obs[i] ~ bernoulli_logit(X_obs[i] * beta_XY + beta_ZY * Z_obs[i]); - } - - for(i in 1:N_cens) - dec_cens[i] ~ bernoulli_logit(alpha_T[jj_cens[i]] + X_cens[i] * beta_XT + beta_ZT * Z_cens[i]); -} - -generated quantities { - int<lower=0, upper=1> y_est[N_cens]; - - for(i in 1:N_cens){ - y_est[i] = bernoulli_logit_rng(X_cens[i] * beta_XY + beta_ZY * Z_cens[i]); - } -} diff --git a/analysis_and_scripts/code_linear_dependency.stan b/analysis_and_scripts/code_linear_dependency.stan new file mode 100644 index 0000000000000000000000000000000000000000..e17716209dd52d3ba048a6c405391333706c79d3 --- /dev/null +++ b/analysis_and_scripts/code_linear_dependency.stan @@ -0,0 +1,101 @@ +data { + int<lower=1> D; // Dimensions of the features and coefficient vectors + int<lower=0> N_obs; // Number of "observed observations" (with T = 1) + int<lower=0> N_cens; // Number of "censored observations" (with T = 0) + int<lower=0> M; // Number of judges + int<lower=1, upper=M> jj_obs[N_obs]; // judge_ID array + int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID array + int<lower=0, upper=1> dec_obs[N_obs]; // Decisions for the observed observations + int<lower=0, upper=1> dec_cens[N_cens]; // Decisions for the censored observations + row_vector[D] X_obs[N_obs]; // Features of the observed observations + row_vector[D] X_cens[N_cens]; // Features of the censored observations + int<lower=0, upper=1> y_obs[N_obs]; // Outcomes of the observed observations +} + +parameters { + // Latent variable + vector[N_obs] Z_obs; + vector[N_cens] Z_cens; + + // Intercepts + real alpha_T[M]; + real alpha_Y; + + // Temporary variables to compute the coefficients + vector[D] a_XT; + vector[D] a_XY; + real<lower=0> a_ZT; // Presume latent variable has a positive coefficient. + real<lower=0> a_ZY; + + real<lower=0> tau_XT; + real<lower=0> tau_XY; + real<lower=0> tau_ZT; + real<lower=0> tau_ZY; + +} + +transformed parameters { + + // Coefficients + vector[D] beta_XT; + vector[D] beta_XY; + real<lower=0> beta_ZT; // Presume latent variable has a positive coefficient. + real<lower=0> beta_ZY; + + beta_XT = a_XT / sqrt(tau_XT); + beta_XY = a_XY / sqrt(tau_XY); + beta_ZT = a_ZT / sqrt(tau_ZT); + beta_ZY = a_ZY / sqrt(tau_ZY); +} + +model { + // Latent variable + Z_obs ~ normal(0, 1); + Z_cens ~ normal(0, 1); + + // Intercepts + alpha_T ~ normal(0, 5); + alpha_Y ~ normal(0, 5); + + // According to + // https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations + // section "Prior for the regression coefficients in logistic regression + // (non-sparse case)" a good prior is Student's t with 3 < nu < 7, + // location 0 and scale 1. + + // Also, according to + // https://mc-stan.org/docs/2_19/stan-users-guide/reparameterization-section.html + // reparametrizing beta ~ student_t(nu, 0, 1) can be done by sampling + // a ~ N(0, 1) and tau ~ gamma(nu/2, nu/2) so that beta = a * tau^(-1/2). + // Reparametrizing is done to achieve better mixing.
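(Aside, not part of the Stan file: the scale-mixture claim in the comment above can be checked numerically. The sketch below assumes nu = 5, matching the gamma(2.5, 2.5) priors that follow in the model block; it draws a ~ N(0, 1) and tau ~ Gamma(nu/2, rate nu/2) and checks that a / sqrt(tau) follows a Student's t distribution with nu degrees of freedom.)

# Numerical sanity check of the Student's t reparameterization (illustration only,
# not repository code).
import numpy as np
import scipy.stats as scs

rng = np.random.default_rng(0)
nu = 5
n = 200000

a = rng.normal(0.0, 1.0, size=n)
tau = rng.gamma(shape=nu / 2, scale=2.0 / nu, size=n)  # rate nu/2 corresponds to scale 2/nu
beta = a / np.sqrt(tau)

# Kolmogorov-Smirnov test against a Student's t with nu degrees of freedom;
# a large p-value means the sampled betas are indistinguishable from that t distribution.
print(scs.kstest(beta, 't', args=(nu,)))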
+ + a_XT ~ normal(0, 1); + a_XY ~ normal(0, 1); + a_ZT ~ normal(0, 1); + a_ZY ~ normal(0, 1); + + // nu = 5 -> nu/2 = 2.5 + tau_XT ~ gamma(2.5, 2.5); + tau_XY ~ gamma(2.5, 2.5); + tau_ZT ~ gamma(2.5, 2.5); + tau_ZY ~ gamma(2.5, 2.5); + + // Compute the regressions for the observed observations + for(i in 1:N_obs){ + dec_obs[i] ~ bernoulli_logit(alpha_T[jj_obs[i]] + X_obs[i] * beta_XT + beta_ZT * Z_obs[i]); + y_obs[i] ~ bernoulli_logit(alpha_Y + X_obs[i] * beta_XY + beta_ZY * Z_obs[i]); + } + + // Compute the regression for the censored observations + for(i in 1:N_cens) + dec_cens[i] ~ bernoulli_logit(alpha_T[jj_cens[i]] + X_cens[i] * beta_XT + beta_ZT * Z_cens[i]); +} + +generated quantities { + int<lower=0, upper=1> y_est[N_cens]; + + // Generate a draw from the posterior predictive. + for(i in 1:N_cens){ + y_est[i] = bernoulli_logit_rng(alpha_Y + X_cens[i] * beta_XY + beta_ZY * Z_cens[i]); + } +} diff --git a/analysis_and_scripts/code_jung_binarized.stan b/analysis_and_scripts/code_nonlinear_dependency.stan similarity index 84% rename from analysis_and_scripts/code_jung_binarized.stan rename to analysis_and_scripts/code_nonlinear_dependency.stan index 259293a262b9ec531d7a270724b30d54526b070c..1e6a42e28f6724931bef43fde338091f3fbf5a77 100644 --- a/analysis_and_scripts/code_jung_binarized.stan +++ b/analysis_and_scripts/code_nonlinear_dependency.stan @@ -4,7 +4,7 @@ data { int<lower=0> N_cens; // Number of "censored observations" (with T = 0) int<lower=1> M; // Number of judges int<lower=1> K; // Number of groups - real<lower=0> sigma_tau; + real<lower=0> sigma_tau; // Prior for the variance parameters. int<lower=1, upper=M> jj_obs[N_obs]; // judge_ID int<lower=1, upper=M> jj_cens[N_cens]; // judge_ID int<lower=1, upper=K> kk_obs[N_obs]; // Grouping @@ -17,15 +17,19 @@ data { } parameters { + // Latent variable vector[N_obs] Z_obs; vector[N_cens] Z_cens; + real alpha_T[M]; // Judge-specific intercepts + // Coefficients vector[D] beta_XT_raw[K]; vector[D] beta_XY_raw[K]; - vector<lower=0>[K] beta_ZT_raw; // Coefficient for the latent variable. - vector<lower=0>[K] beta_ZY_raw; // Coefficient for the latent variable. + // Coefficient for the latent variable presumed to be positive + vector<lower=0>[K] beta_ZT_raw; + vector<lower=0>[K] beta_ZY_raw; real<lower=0> tau_XT; real<lower=0> tau_XY; @@ -34,6 +38,11 @@ parameters { } transformed parameters{ + // Here we transform the _raw parameters to be random-walk + // or cumulative random-walk. See Jung's arXiv paper for more + // information. We extended this by specifying that judges have + // separate intercepts. + vector[D] beta_XT[K]; vector[D] beta_XY[K]; @@ -47,7 +56,9 @@ transformed parameters{ beta_XT[1] = beta_XT_raw[1]; beta_XY[1] = beta_XY_raw[1]; - for(i in 2:K){ // random walk prior here + for(i in 2:K){ + // random walk prior, value of next group's coefficient + // depends on the previous. 
beta_XT[i] = tau_XT * beta_XT_raw[i-1]; // ith group beta_XY[i] = tau_XY * beta_XY_raw[i-1]; } diff --git a/analysis_and_scripts/parallel_model_job.job b/analysis_and_scripts/parallel_model_job.job new file mode 100644 index 0000000000000000000000000000000000000000..49cba7d371fd5057913df73376f11bcbfa425238 --- /dev/null +++ b/analysis_and_scripts/parallel_model_job.job @@ -0,0 +1,12 @@ +#!/bin/bash +#SBATCH --job-name=sl_stan_model_parallel +#SBATCH --workdir=/wrk/users/rikulain/ +#SBATCH -c 5 +#SBATCH -t 30:00:00 +#SBATCH --mem=45G +#SBATCH --mail-type=ALL +#SBATCH --mail-user=riku.laine@helsinki.fi + +srun hostname + +time python /proj/rikulain/stan_modelling_theoretic.py "/wrk/users/rikulain/figures/results_${1}_" 25000 50 $1 1 /proj/rikulain/code_full.stan 1 diff --git a/analysis_and_scripts/stan_modelling_empirical.py b/analysis_and_scripts/stan_modelling_empirical.py index 6ff23e3a0bb8b142018673854e33232dfa657043..9468f140387c3088242c5bddfb89c3330fdaa956 100644 --- a/analysis_and_scripts/stan_modelling_empirical.py +++ b/analysis_and_scripts/stan_modelling_empirical.py @@ -2,66 +2,59 @@ # -*- coding: utf-8 -*- """ # Author: Riku Laine -# Date: 26JUL2019 +# Date: 12AUG2019 # Project name: Potential outcomes in model evaluation # Description: This script creates the figures and results used -# in empirical data experiments. +# in empirical data experiments with COMPAS data. # # Parameters: # ----------- -# (1) figure_path : file name for saving the created figures. -# (2) nIter : Number of train test splits to perform on the data. -# (3) group_amount : How many groups if Jung-inspired model is used. -# (4) stan_code_file_name : Name of file containing the stan model code. -# (5) sigma_tau : Values of prior variance for the Jung-inspired model. -# (6) data_path : File of compas data. - +# (1) save_name : file path and prefix for saving the created figures. +# (2) group_amount : How many groups if the non-linear model is used. +# (3) stan_code_file_name : Name of file containing the stan model code. +# (4) sigma_tau : Values of prior variance for the Jung-inspired model. +# (5) data_path : Path to data file. 
""" import numpy as np import pandas as pd import matplotlib.pyplot as plt -import scipy.stats as scs import scipy.special as ssp from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split import pystan +import gc plt.switch_backend('agg') +plt.rcParams.update({'font.size': 16}) +plt.rcParams.update({'figure.figsize': (10, 6)}) + import sys # figure storage name -figure_path = sys.argv[1] - -# Iterations -nIter = int(sys.argv[2]) +save_name = sys.argv[1] + "sl_compas" # How many groups if jung model is used -group_amount = int(sys.argv[3]) +group_amount = int(sys.argv[2]) # Name of stan model code file -stan_code_file_name = sys.argv[4] +stan_code_file_name = sys.argv[3] # Variance prior -sigma_tau = float(sys.argv[5]) +sigma_tau = float(sys.argv[4]) # figure storage name -data_path = sys.argv[6] +data_path = sys.argv[5] # Prefix for the figures and log files print("These results have been obtained with the following settings:") -print("Number of train test splits :", nIter) - print("Number of groups:", group_amount) print("Prior for the variances:", sigma_tau) -save_name = "sl_compas" - def inv_logit(x): return 1.0 / (1.0 + np.exp(-1.0 * x)) @@ -76,6 +69,11 @@ def inverse_cumulative(x, mu, sigma): return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) +def standardError(p, n): + denominator = p * (1 - p) + + return np.sqrt(denominator / n) + ########################## # ## Evaluator modules @@ -207,6 +205,26 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, # Subset. "D_q is the set of all observations judged by q." D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() + ##### Agreement rate computation begins ####### + + max_leniency = max(df[accRateR_col]) + + # Sort D_q + # "Observations deemed as high risk by B are at the top of this list" + D_sort_q = D_q.sort_values(by=modelProbS_col, ascending=False) + + # The agreement rate is now computed as the percentage of all the positive + # decisions given by q in the 1-r % of the most dangerous subjects. + all_negatives = np.sum(D_sort_q[decisionT_col] == 0) + + n_most_dangerous = int(round(D_q.shape[0] * (1-max_leniency))) + + agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous]) + + print("The agreement rate of contraction is:", agreed_decisions/all_negatives) + + ##### Agreement rate computation ends ####### + # All observations of R_q have observed outcome labels. # "R_q is the set of observations in D_q with observed outcome labels." R_q = D_q[D_q[decisionT_col] == 1].copy() @@ -222,109 +240,42 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, # "R_B is the list of observations assigned to t = 1 by B" R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] - return np.sum(R_B[resultY_col] == 0) / D_q.shape[0] + return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0] # ### Evaluators -def contractionEvaluator(df, featureX_col, judgeIDJ_col, decisionT_col, - resultY_col, accRateR_col, r): - - train, test = train_test_split(df, test_size=0.5) - - B_model, predictions = fitPredictiveModel( - train.loc[train[decisionT_col] == 1, featureX_col], - train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], - 0) - - test = test.assign(B_prob_0_model=predictions) - - # Invoke the original contraction. 
- FR = contraction(test, - judgeIDJ_col=judgeIDJ_col, - decisionT_col=decisionT_col, - resultY_col=resultY_col, - modelProbS_col="B_prob_0_model", - accRateR_col=accRateR_col, - r=r) - - return FR - +def trueEvaluationEvaluator(df, resultY_col, r): -def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r, - fit_model=True): + df.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - train, test = train_test_split(df, test_size=0.5) - - if fit_model: - B_model, predictions = fitPredictiveModel(train[featureX_col], - train[resultY_col], - test[featureX_col], 0) + to_release = int(round(df.shape[0] * r)) - test = test.assign(B_prob_0_model=predictions) - - test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - - to_release = int(round(test.shape[0] * r)) - - return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + failed = df[resultY_col][0:to_release] == 0 + return np.sum(failed) / df.shape[0], df.shape[0] -def labeledOutcomesEvaluator(df, - featureX_col, - decisionT_col, - resultY_col, - r, - adjusted=False, - fit_model=True): - train, test = train_test_split(df, test_size=0.5) +def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False): - if fit_model: - B_model, predictions = fitPredictiveModel( - train.loc[train[decisionT_col] == 1, featureX_col], - train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], - 0) + df_observed = df.loc[df[decisionT_col] == 1, :] - test = test.assign(B_prob_0_model=predictions) - - test_observed = test.loc[test[decisionT_col] == 1, :] - - test_observed = test_observed.sort_values(by='B_prob_0_model', + df_observed = df_observed.sort_values(by='B_prob_0_model', inplace=False, ascending=True) - to_release = int(round(test_observed.shape[0] * r)) + to_release = int(round(df_observed.shape[0] * r)) + + failed = df_observed[resultY_col][0:to_release] == 0 if adjusted: - return np.mean(test_observed[resultY_col][0:to_release] == 0) + return np.mean(failed), df.shape[0] - return np.sum( - test_observed[resultY_col][0:to_release] == 0) / test.shape[0] + return np.sum(failed) / df.shape[0], df.shape[0] -def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, - accRateR_col, r): +################### Data preprocessing ###################### - # Get judges with correct leniency as list - is_correct_leniency = df[accRateR_col].round(1) == r - - # No judges with correct leniency - if np.sum(is_correct_leniency) == 0: - return np.nan - - correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] - - # Released are the people they judged and released, T = 1 - released = df[df[judgeIDJ_col].isin(correct_leniency_list) - & (df[decisionT_col] == 1)] - - # Get their failure rate, aka ratio of reoffenders to number of people judged in total - return np.sum(released[resultY_col] == 0) / correct_leniency_list.shape[0] - - -################### - # Read in the data compas_raw = pd.read_csv(data_path) @@ -348,7 +299,7 @@ compas = compas[compas.score_text.notnull()] # So here result_Y = 1 - is_recid (inverted binary coding). compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1) -# Convert string values to dummies, drop first so full rank +# Convert string values to dummies, drop first to keep full rank. 
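As a usage sketch (not repository code), the evaluator interface defined above can be exercised on a toy data frame: both evaluators return a pair (failure rate, number of observations), and standardError gives the binomial standard error of such a rate. The column names and codings below follow this script's conventions (decision_T = 1 means released, result_Y = 0 means a negative outcome), but the values are made up.

# Toy illustration of the evaluator interface; assumes trueEvaluationEvaluator,
# labeledOutcomesEvaluator and standardError from this script are in scope.
import pandas as pd

toy = pd.DataFrame({
    'B_prob_0_model': [0.1, 0.2, 0.4, 0.6, 0.8, 0.9],  # model B's P(Y = 0 | x)
    'decision_T': [1, 1, 1, 0, 1, 0],                   # 1 = positive decision (released)
    'result_Y': [1, 1, 0, 1, 0, 1]})                    # 0 = negative outcome

fr_true, n_true = trueEvaluationEvaluator(toy, 'result_Y', r=0.5)
fr_label, n_label = labeledOutcomesEvaluator(toy, 'decision_T', 'result_Y', r=0.5)

print("True evaluation:  %.3f +/- %.3f" % (fr_true, standardError(fr_true, n_true)))
print("Labeled outcomes: %.3f +/- %.3f" % (fr_label, standardError(fr_label, n_label)))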
compas_dummy = pd.get_dummies( compas, columns=['c_charge_degree', 'race', 'age_cat', 'sex'], @@ -363,13 +314,13 @@ compas_dummy.drop(columns=[ # Shuffle rows for random judge assignment compas_shuffled = compas_dummy.sample(frac=1) -nJudges_M = 9 +nJudges_M = 10 # Assign judges as evenly as possible judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False) # Assign fixed leniencies from 0.1 to 0.9 -judge_leniency = np.arange(1, 10) / 10 +judge_leniency = np.array([0.1, 0.3, 0.5, 0.7, 0.9, 0.1, 0.3, 0.5, 0.7, 0.9]) judge_leniency = judge_leniency[judge_ID] @@ -412,162 +363,187 @@ feature_cols = compas_labeled.columns[feature_cols] ####### Draw figures ########### -failure_rates = np.zeros((8, 6)) -failure_sems = np.zeros((8, 6)) - -f_rate_true = np.zeros((nIter, 8)) -f_rate_label = np.zeros((nIter, 8)) -f_rate_label_adj = np.zeros((nIter, 8)) -f_rate_human = np.zeros((nIter, 8)) -f_rate_cont = np.zeros((nIter, 8)) -f_rate_caus = np.zeros((nIter, 8)) - - -# Split data -train, test = train_test_split(compas_labeled, test_size=0.5) - -# Train a logistic regression model -B_model, predictions = fitPredictiveModel( - train.loc[train['decision_T'] == 1, feature_cols], - train.loc[train['decision_T'] == 1, 'result_Y'], test[feature_cols], 0) - -test = test.assign(B_prob_0_model=predictions) - -test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) +def drawDiagnostics(title, save_name, f_rates, titles): + + cols = 2 + rows = np.ceil(len(f_rates) / cols) + + plt.figure(figsize=(16, 4.5*rows+1)) -kk_array = pd.qcut(test['B_prob_0_model'], group_amount, labels=False) + ax = plt.subplot(rows, cols, 1) + x_ax = np.arange(1, 9, 1) / 10 -# Find observed values -observed = test['decision_T'] == 1 + plt.boxplot(f_rates[0], labels=x_ax) -# Assign data to the model -dat = dict(D=1, - N_obs=np.sum(observed), - N_cens=np.sum(~observed), - K=group_amount, - sigma_tau=sigma_tau, - M=len(set(compas_labeled['judge_ID'])), - jj_obs=test.loc[observed, 'judge_ID']+1, - jj_cens=test.loc[~observed, 'judge_ID']+1, - kk_obs=kk_array[observed]+1, - kk_cens=kk_array[~observed]+1, - dec_obs=test.loc[observed, 'decision_T'], - dec_cens=test.loc[~observed, 'decision_T'], - X_obs=test.loc[observed, 'B_prob_0_model'].values.reshape(-1,1), - X_cens=test.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1), - y_obs=test.loc[observed, 'result_Y'].astype(int)) + plt.title(titles[0]) + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.grid() -sm = pystan.StanModel(file=stan_code_file_name) + for i in range(len(f_rates)): + plt.subplot(rows, cols, i + 1, sharey=ax) -fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9)) + plt.boxplot(f_rates[i], labels=x_ax) -pars = fit.extract() + plt.title(titles[i]) + plt.xlabel('Acceptance rate') + plt.ylabel('Failure rate') + plt.grid() -print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w')) + plt.tight_layout() + plt.subplots_adjust(top=0.85) + plt.suptitle(title, y=0.96, weight='bold') -plt.figure(figsize=(15,30)) + plt.savefig(save_name + '_diagnostic_plot') -fit.plot(); + plt.show() -plt.savefig(save_name + '_stan_diagnostic_plot') +nIter = 1 -plt.show() -plt.close('all') +failure_rates = np.zeros((8, 4)) +failure_stds = np.zeros((8, 4)) -# Bayes +f_rate_true = np.zeros((nIter, 8)) +f_rate_label = np.zeros((nIter, 8)) +f_rate_cont = np.zeros((nIter, 8)) +f_rate_cf = np.zeros((nIter, 8)) -# Alusta matriisi, rivillä yksi otos posteriorista -# sarakkeet havaintoja -y_imp = np.ones((pars['y_est'].shape[0], 
test.shape[0])) -# Täydennetään havaitsemattomat estimoiduilla -y_imp[:, ~observed] = 1-pars['y_est'] +for i in range(nIter): -# Täydennetään havaitut havaituilla -y_imp[:, observed] = 1-test.loc[observed, 'result_Y'] + # Split data by the judges + train = compas_labeled[compas_labeled.judge_ID <=4] + test_labeled = compas_labeled[compas_labeled.judge_ID >4] + + # Assign same observations to unlabeled data + test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values] + + + # Train a logistic regression model + B_model, predictions = fitPredictiveModel( + train.loc[train['decision_T'] == 1, feature_cols], + train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0) + + # Attach predictions to data + test_labeled = test_labeled.assign(B_prob_0_model=predictions) + test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions) + + test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + + kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False) + + # Find observed values + observed = test_labeled['decision_T'] == 1 + + # Assign data to the model + dat = dict(D=10, + N_obs=np.sum(observed), + N_cens=np.sum(~observed), + K=group_amount, + sigma_tau=sigma_tau, + M=len(set(compas_labeled['judge_ID'])), + jj_obs=test_labeled.loc[observed, 'judge_ID']+1, + jj_cens=test_labeled.loc[~observed, 'judge_ID']+1, + kk_obs=kk_array[observed]+1, + kk_cens=kk_array[~observed]+1, + dec_obs=test_labeled.loc[observed, 'decision_T'], + dec_cens=test_labeled.loc[~observed, 'decision_T'], + X_obs=test_labeled.loc[observed, feature_cols.values], + X_cens=test_labeled.loc[~observed, feature_cols.values], + y_obs=test_labeled.loc[observed, 'result_Y'].astype(int)) + + sm = pystan.StanModel(file=stan_code_file_name) + + # Do the sampling + fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.95)) + + pars = fit.extract() + + plt.figure(figsize=(15,30)) -Rs = np.arange(.1, .9, .1) + fit.plot(); -to_release_list = np.round(test.shape[0] * Rs).astype(int) + plt.savefig(save_name + '_stan_convergence_diagnostic_plot_' + str(i)) + + plt.show() + plt.close('all') + gc.collect() -f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan) + print(fit, file=open(save_name + '_stan_fit_diagnostics_' + str(i) + '.txt', 'w')) -for i in range(len(to_release_list)): - est_failure_rates = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) / test.shape[0] + # Bayes + + # Format matrix, each row is a sample of the posterior + # columns are observations + + y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0])) + + # Impute the unobserved with the estimated + # Revers the binary coding to compute failures as sum 0 == 0 = 1 + y_imp[:, ~observed] = 1 - pars['y_est'] + + # Remove the variables to conserve memory + del(pars) + del(fit) - f_rate_bayes[:, i] = est_failure_rates - failure_rates[i, 5] = np.mean(est_failure_rates) + # Impute the observed + y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y'] + + for r in range(1, 9): -for i in range(nIter): + to_release = np.round(test_labeled.shape[0] * (r / 10)).astype(int) + + failed = np.sum(y_imp[:, 0:to_release], axis=1) + + f_rate_cf[i, r - 1] = np.mean(failed / test_labeled.shape[0]) - print(" [", i, "] ", sep='', end="") - - for r in np.arange(1, 9): - print(".", end="") - + # True evaluation - - f_rate_true[i, r - 1] = trueEvaluationEvaluator( - compas_unlabeled, feature_cols, 'decision_T', 'result_Y', r / 10) - + + f_rate_true[i, r - 1], N = 
trueEvaluationEvaluator(test_unlabeled, + 'result_Y', r / 10) + # Labeled outcomes only - - f_rate_label[i, r - 1] = labeledOutcomesEvaluator( - compas_labeled, feature_cols, 'decision_T', 'result_Y', r / 10) - - # Adjusted labeled outcomes - - f_rate_label_adj[i, r - 1] = labeledOutcomesEvaluator( - compas_labeled, - feature_cols, - 'decision_T', - 'result_Y', - r / 10, - adjusted=True) - - # Human evaluation - - f_rate_human[i, r - 1] = humanEvaluationEvaluator( - compas_labeled, 'judge_ID', 'decision_T', 'result_Y', - 'judge_leniency', r / 10) - + + f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled, + 'decision_T', 'result_Y', r / 10) + # Contraction - - f_rate_cont[i, r - 1] = contractionEvaluator( - compas_labeled, feature_cols, 'judge_ID', 'decision_T', 'result_Y', - 'judge_leniency', r / 10) - + + f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judge_ID', + 'decision_T', 'result_Y', 'B_prob_0_model', + 'judge_leniency', r / 10) + failure_rates[:, 0] = np.mean(f_rate_true, axis=0) failure_rates[:, 1] = np.mean(f_rate_label, axis=0) -failure_rates[:, 2] = np.mean(f_rate_label_adj, axis=0) -failure_rates[:, 3] = np.mean(f_rate_human, axis=0) -failure_rates[:, 4] = np.mean(f_rate_cont, axis=0) +failure_rates[:, 2] = np.mean(f_rate_cont, axis=0) +failure_rates[:, 3] = np.mean(f_rate_cf, axis=0) -failure_sems[:, 0] = scs.sem(f_rate_true, axis=0) -failure_sems[:, 1] = scs.sem(f_rate_label, axis=0) -failure_sems[:, 2] = scs.sem(f_rate_label_adj, axis=0) -failure_sems[:, 3] = scs.sem(f_rate_human, axis=0) -failure_sems[:, 4] = scs.sem(f_rate_cont, axis=0) -failure_sems[:, 5] = scs.sem(f_rate_bayes, axis=0, nan_policy='omit') +failure_stds[:, 0] = np.std(f_rate_true, axis=0) +failure_stds[:, 1] = np.std(f_rate_label, axis=0) +failure_stds[:, 2] = np.std(f_rate_cont, axis=0) +failure_stds[:, 3] = np.std(f_rate_cf, axis=0) x_ax = np.arange(0.1, 0.9, 0.1) -labels = [ - 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', - 'Human evaluation', 'Contraction', 'Potential outcomes' -] -colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c'] +labels = ['True Evaluation', 'Labeled outcomes', 'Contraction', + 'Counterfactuals'] + +colours = ['g', 'magenta', 'b', 'r'] + +line_styles = ['--', ':', '-.', '-'] for i in range(failure_rates.shape[1]): plt.errorbar(x_ax, failure_rates[:, i], label=labels[i], c=colours[i], - yerr=failure_sems[:, i]) + linestyle=line_styles[i], + yerr=failure_stds[:, i]) -plt.title('Failure rate vs. Acceptance rate') +#plt.title('FR vs. 
AR') plt.xlabel('Acceptance rate') plt.ylabel('Failure rate') plt.legend() @@ -580,50 +556,18 @@ plt.show() print("\nFailure rates:") print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) +print("\nStandard deviations:") +print(np.array2string(failure_stds, formatter={'float_kind':lambda x: "%.5f" % x})) + + print("\nMean absolute errors:") for i in range(1, failure_rates.shape[1]): print( labels[i].ljust(len(max(labels, key=len))), np.round( np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) - - -# Draw diagnostic figures -f_rates= [f_rate_true, f_rate_label, f_rate_label_adj, f_rate_human, - f_rate_cont, f_rate_bayes] - -cols = 2 -rows = np.ceil(len(f_rates) / cols) - -plt.figure(figsize=(16, 4.5*rows+1)) - -ax = plt.subplot(rows, cols, 1) -x_ax = np.arange(1, 9, 1) / 10 - -plt.boxplot(f_rates[0], labels=x_ax) - -plt.title(labels[0]) -plt.xlabel('Acceptance rate') -plt.ylabel('Failure rate') -plt.grid() - -for i in range(len(f_rates)): - plt.subplot(rows, cols, i + 1, sharey=ax) - - plt.boxplot(f_rates[i], labels=x_ax) - - plt.title(labels[i]) - plt.xlabel('Acceptance rate') - plt.ylabel('Failure rate') - plt.grid() - -plt.tight_layout() -plt.subplots_adjust(top=0.89) - -title = "COMPAS data set\nFluctuation of failure rate estimates per method" - -plt.suptitle(title, y=0.96, weight='bold') - -plt.savefig(save_name + '_diagnostic_plot') -plt.show() \ No newline at end of file +drawDiagnostics(title="Variation of failure rate estimates by method\nCOMPAS data set", + save_name=save_name, + f_rates=[f_rate_true, f_rate_label, f_rate_cont, f_rate_cf], + titles=labels) \ No newline at end of file diff --git a/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py b/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py deleted file mode 100644 index 18eb1315f27bc7f53fd8310338f2c593ee1334c8..0000000000000000000000000000000000000000 --- a/analysis_and_scripts/stan_modelling_empirical_SEfixed_usefeatures.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -# Author: Riku Laine -# Date: 26JUL2019 -# Project name: Potential outcomes in model evaluation -# Description: This script creates the figures and results used -# in empirical data experiments. -# -# Parameters: -# ----------- -# (1) figure_path : file name for saving the created figures. -# (2) group_amount : How many groups if Jung-inspired model is used. -# (3) stan_code_file_name : Name of file containing the stan model code. -# (4) sigma_tau : Values of prior variance for the Jung-inspired model. -# (5) data_path : File of compas data. 
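(Illustrative aside, not repository code: the "Counterfactuals" estimate in the script above reduces to a simple operation on the imputed outcome matrix. In the sketch below, rows of y_imp_toy stand for posterior draws, columns for observations ordered by B_prob_0_model, and a 1 marks an observed or imputed failure; the numbers are synthetic.)

# Stand-alone sketch of the posterior (counterfactual) failure rate computation.
import numpy as np

rng = np.random.default_rng(1)
n_draws, n_obs = 4, 10

# Each entry: 1 if the (observed or imputed) outcome is a failure, else 0.
y_imp_toy = rng.integers(0, 2, size=(n_draws, n_obs))

r = 0.5
to_release = int(round(n_obs * r))

# Count failures among the to_release lowest-risk observations in each draw,
# divide by the total number of observations, then average over draws.
failed_per_draw = np.sum(y_imp_toy[:, 0:to_release], axis=1)
print(np.mean(failed_per_draw / n_obs))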
- -""" - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import scipy.special as ssp -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split -import pystan - -plt.switch_backend('agg') - -import sys - -# figure storage name -figure_path = sys.argv[1] - -# How many groups if jung model is used -group_amount = int(sys.argv[2]) - -# Name of stan model code file -stan_code_file_name = sys.argv[3] - -# Variance prior -sigma_tau = float(sys.argv[4]) - -# figure storage name -data_path = sys.argv[5] - -# Prefix for the figures and log files - -print("These results have been obtained with the following settings:") - -print("Number of groups:", group_amount) - -print("Prior for the variances:", sigma_tau) - -save_name = "sl_compas" - -def inv_logit(x): - return 1.0 / (1.0 + np.exp(-1.0 * x)) - - -def logit(x): - return np.log(x) - np.log(1.0 - x) - - -def inverse_cumulative(x, mu, sigma): - '''Compute the inverse of the cumulative distribution of logit-normal - distribution at x with parameters mu and sigma (mean and st.dev.).''' - - return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) - -def standardError(p, n): - denominator = p * (1 - p) - - return np.sqrt(denominator / n) - -########################## - -# ## Evaluator modules - -# ### Convenience functions - -def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None): - ''' - Fit a predictive model (default logistic regression) with given training - instances and return probabilities for test instances to obtain a given - class label. - - Arguments: - ---------- - - x_train -- x values of training instances - y_train -- y values of training instances - x_test -- x values of test instances - class_value -- class label for which the probabilities are counted for. - model_type -- type of model to be fitted. - - Returns: - -------- - (1) Trained predictive model - (2) Probabilities for given test inputs for given class. - ''' - - if model_type is None or model_type in ["logistic_regression", "lr"]: - # Instantiate the model (using the default parameters) - logreg = LogisticRegression(solver='lbfgs') - - # Check shape and fit the model. - if x_train.ndim == 1: - logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train) - else: - logreg = logreg.fit(x_train, y_train) - - label_probs_logreg = getProbabilityForClass(x_test, logreg, - class_value) - - return logreg, label_probs_logreg - - elif model_type in ["random_forest", "rf"]: - # Instantiate the model - forest = RandomForestClassifier(n_estimators=100, max_depth=3) - - # Check shape and fit the model. - if x_train.ndim == 1: - forest = forest.fit(x_train.values.reshape(-1, 1), y_train) - else: - forest = forest.fit(x_train, y_train) - - label_probs_forest = getProbabilityForClass(x_test, forest, - class_value) - - return forest, label_probs_forest - - elif model_type == "fully_random": - - label_probs = np.ones_like(x_test) / 2 - - model_object = lambda x: 0.5 - - return model_object, label_probs - else: - raise ValueError("Invalid model_type!", model_type) - - -def getProbabilityForClass(x, model, class_value): - ''' - Function (wrapper) for obtaining the probability of a class given x and a - predictive model. - - Arguments: - ----------- - x -- individual features, an array of shape (observations, features) - model -- a trained sklearn model. Predicts probabilities for given x. 
- Should accept input of shape (observations, features) - class_value -- the resulting class to predict (usually 0 or 1). - - Returns: - -------- - (1) The probabilities of given class label for each x. - ''' - if x.ndim == 1: - # if x is vector, transform to column matrix. - f_values = model.predict_proba(np.array(x).reshape(-1, 1)) - else: - f_values = model.predict_proba(x) - - # Get correct column of predicted class, remove extra dimensions and return. - return f_values[:, model.classes_ == class_value].flatten() - - -# ### Contraction algorithm -# -# Below is an implementation of Lakkaraju's team's algorithm presented in -# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant -# parameters to be passed to the function are presented in the description. - -def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, - accRateR_col, r): - ''' - This is an implementation of the algorithm presented by Lakkaraju - et al. in their paper "The Selective Labels Problem: Evaluating - Algorithmic Predictions in the Presence of Unobservables" (2017). - - Arguments: - ---------- - df -- The (Pandas) data frame containing the data, judge decisions, - judge IDs, results and probability scores. - judgeIDJ_col -- String, the name of the column containing the judges' IDs - in df. - decisionT_col -- String, the name of the column containing the judges' decisions - resultY_col -- String, the name of the column containing the realization - modelProbS_col -- String, the name of the column containing the probability - scores from the black-box model B. - accRateR_col -- String, the name of the column containing the judges' - acceptance rates - r -- Float between 0 and 1, the given acceptance rate. - - Returns: - -------- - (1) The estimated failure rate at acceptance rate r. - ''' - # Get ID of the most lenient judge. - most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()] - - # Subset. "D_q is the set of all observations judged by q." - D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() - - # All observations of R_q have observed outcome labels. - # "R_q is the set of observations in D_q with observed outcome labels." - R_q = D_q[D_q[decisionT_col] == 1].copy() - - # Sort observations in R_q in descending order of confidence scores S and - # assign to R_sort_q. 
- # "Observations deemed as high risk by B are at the top of this list" - R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False) - - number_to_remove = int( - round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0]))) - - # "R_B is the list of observations assigned to t = 1 by B" - R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] - - return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0] - - -# ### Evaluators - -def trueEvaluationEvaluator(df, resultY_col, r): - - df.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - - to_release = int(round(df.shape[0] * r)) - - failed = df[resultY_col][0:to_release] == 0 - - return np.sum(failed) / df.shape[0], df.shape[0] - - -def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False): - - df_observed = df.loc[df[decisionT_col] == 1, :] - - df_observed = df_observed.sort_values(by='B_prob_0_model', - inplace=False, - ascending=True) - - to_release = int(round(df_observed.shape[0] * r)) - - failed = df_observed[resultY_col][0:to_release] == 0 - - if adjusted: - return np.mean(failed), df.shape[0] - - return np.sum(failed) / df.shape[0], df.shape[0] - - -def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, - accRateR_col, r): - - # Get judges with correct leniency as list - is_correct_leniency = df[accRateR_col].round(1) == r - - # No judges with correct leniency - if np.sum(is_correct_leniency) == 0: - return np.nan, np.nan - - correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] - - # Released are the people they judged and released, T = 1 - released = df[df[judgeIDJ_col].isin(correct_leniency_list) - & (df[decisionT_col] == 1)] - - failed = released[resultY_col] == 0 - - # Get their failure rate, aka ratio of reoffenders to number of people judged in total - return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0] - -################### - -# Read in the data -compas_raw = pd.read_csv(data_path) - -# Select columns -compas = compas_raw[[ - 'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', - 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', - 'two_year_recid', 'c_jail_in', 'c_jail_out' -]] - -# Subset values, see reasons in ProPublica methodology. -compas = compas.query('days_b_screening_arrest <= 30 and \ - days_b_screening_arrest >= -30 and \ - is_recid != -1 and \ - c_charge_degree != "O"') - -# Drop row if score_text is na -compas = compas[compas.score_text.notnull()] - -# Recode recidivism values to fit earlier notation -# So here result_Y = 1 - is_recid (inverted binary coding). 
-compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1) - -# Convert string values to dummies, drop first so full rank -compas_dummy = pd.get_dummies( - compas, - columns=['c_charge_degree', 'race', 'age_cat', 'sex'], - drop_first=True) - -compas_dummy.drop(columns=[ - 'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', - 'two_year_recid', 'score_text', 'is_recid' -], - inplace=True) - -# Shuffle rows for random judge assignment -compas_shuffled = compas_dummy.sample(frac=1) - -nJudges_M = 9 - -# Assign judges as evenly as possible -judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False) - -# Assign fixed leniencies from 0.1 to 0.9 -judge_leniency = np.arange(1, 10) / 10 - -judge_leniency = judge_leniency[judge_ID] - -compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID, - judge_leniency=judge_leniency) - -# Sort by judges then probabilities in decreasing order -# Most dangerous for each judge are at the top. -compas_shuffled.sort_values(by=["judge_ID", "decile_score"], - ascending=False, - inplace=True) - -# Iterate over the data. Subject will be given a negative decision -# if they are in the top (1-r)*100% of the individuals the judge will judge. -# I.e. if their within-judge-index is under 1 - acceptance threshold times -# the number of subjects assigned to each judge they will receive a -# negative decision. -compas_shuffled.reset_index(drop=True, inplace=True) - -subjects_allocated = compas_shuffled.judge_ID.value_counts() - -compas_shuffled['judge_index'] = compas_shuffled.groupby('judge_ID').cumcount() - -compas_shuffled['decision_T'] = np.where( - compas_shuffled['judge_index'] < (1 - compas_shuffled['judge_leniency']) * - subjects_allocated[compas_shuffled['judge_ID']].values, 0, 1) - -compas_labeled = compas_shuffled.copy() -compas_unlabeled = compas_shuffled.copy() - -# Hide unobserved -compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan - -# Choose feature_columns -feature_cols = ~compas_labeled.columns.isin( - ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T']) - -feature_cols = compas_labeled.columns[feature_cols] - - -####### Draw figures ########### - -failure_rates = np.zeros((8, 6)) -error_Ns = np.zeros((8, 6)) - -# Split data -train, test_labeled = train_test_split(compas_labeled, test_size=0.5) - -# Assign same observations to unlabeled data -test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values] - - -# Train a logistic regression model -B_model, predictions = fitPredictiveModel( - train.loc[train['decision_T'] == 1, feature_cols], - train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0) - -test_labeled = test_labeled.assign(B_prob_0_model=predictions) -test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions) - -test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - -kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False) - -# Find observed values -observed = test_labeled['decision_T'] == 1 - -# Assign data to the model -dat = dict(D=10, - N_obs=np.sum(observed), - N_cens=np.sum(~observed), - K=group_amount, - sigma_tau=sigma_tau, - M=len(set(compas_labeled['judge_ID'])), - jj_obs=test_labeled.loc[observed, 'judge_ID']+1, - jj_cens=test_labeled.loc[~observed, 'judge_ID']+1, - kk_obs=kk_array[observed]+1, - kk_cens=kk_array[~observed]+1, - dec_obs=test_labeled.loc[observed, 'decision_T'], - dec_cens=test_labeled.loc[~observed, 'decision_T'], - X_obs=test_labeled.loc[observed, 
feature_cols.values], - X_cens=test_labeled.loc[~observed, feature_cols.values], - y_obs=test_labeled.loc[observed, 'result_Y'].astype(int)) - -sm = pystan.StanModel(file=stan_code_file_name) - -fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9)) - -pars = fit.extract() - -print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w')) - -plt.figure(figsize=(15,30)) - -fit.plot(); - -plt.savefig(save_name + '_stan_diagnostic_plot') - -plt.show() -plt.close('all') - -# Bayes - -# Alusta matriisi, rivillä yksi otos posteriorista -# sarakkeet havaintoja -y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0])) - -# Täydennetään havaitsemattomat estimoiduilla -y_imp[:, ~observed] = 1-pars['y_est'] - -# Täydennetään havaitut havaituilla -y_imp[:, observed] = 1-test_labeled.loc[observed, 'result_Y'] - -Rs = np.arange(.1, .9, .1) - -to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int) - -f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan) - -for i in range(len(to_release_list)): - - failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) - - est_failure_rates = failed / test_labeled.shape[0] - - failure_rates[i, 5] = np.mean(est_failure_rates) - - error_Ns[i, 5] = test_labeled.shape[0] - -for r in np.arange(1, 9): - - print(".", end="") - - # True evaluation - - FR, N = trueEvaluationEvaluator(test_unlabeled, 'result_Y', r / 10) - - failure_rates[r - 1, 0] = FR - error_Ns[r - 1, 0] = N - - # Labeled outcomes only - - FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10) - - failure_rates[r - 1, 1] = FR - error_Ns[r - 1, 1] = N - - # Adjusted labeled outcomes - - FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10, - adjusted=True) - - failure_rates[r - 1, 2] = FR - error_Ns[r - 1, 2] = N - - # Human evaluation - - FR, N = humanEvaluationEvaluator(test_labeled, 'judge_ID', 'decision_T', - 'result_Y', 'judge_leniency', - r / 10) - - failure_rates[r - 1, 3] = FR - error_Ns[r - 1, 3] = N - - # Contraction - - FR, N = contraction(test_labeled, 'judge_ID', 'decision_T', 'result_Y', - 'B_prob_0_model', 'judge_leniency', r / 10) - - failure_rates[r - 1, 4] = FR - error_Ns[r - 1, 4] = N - -x_ax = np.arange(0.1, 0.9, 0.1) - -failure_SEs = standardError(failure_rates, error_Ns) - -labels = [ - 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', - 'Human evaluation', 'Contraction', 'Potential outcomes' -] -colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c'] - -for i in range(failure_rates.shape[1]): - plt.errorbar(x_ax, - failure_rates[:, i], - label=labels[i], - c=colours[i], - yerr=failure_SEs[:, i]) - -plt.title('Failure rate vs. 
Acceptance rate') -plt.xlabel('Acceptance rate') -plt.ylabel('Failure rate') -plt.legend() -plt.grid() - -plt.savefig(save_name + '_all') - -plt.show() - -print("\nFailure rates:") -print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) - -print("\nMean absolute errors:") -for i in range(1, failure_rates.shape[1]): - print( - labels[i].ljust(len(max(labels, key=len))), - np.round( - np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) \ No newline at end of file diff --git a/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py b/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py deleted file mode 100644 index 49e23f2285d535638f29ba9e74aba4bc67fa9dcd..0000000000000000000000000000000000000000 --- a/analysis_and_scripts/stan_modelling_empirical_SEfixed_useprediction.py +++ /dev/null @@ -1,532 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -# Author: Riku Laine -# Date: 26JUL2019 -# Project name: Potential outcomes in model evaluation -# Description: This script creates the figures and results used -# in empirical data experiments. -# -# Parameters: -# ----------- -# (1) figure_path : file name for saving the created figures. -# (2) group_amount : How many groups if Jung-inspired model is used. -# (3) stan_code_file_name : Name of file containing the stan model code. -# (4) sigma_tau : Values of prior variance for the Jung-inspired model. -# (5) data_path : File of compas data. - -""" - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import scipy.special as ssp -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split -import pystan - -plt.switch_backend('agg') - -import sys - -# figure storage name -figure_path = sys.argv[1] - -# How many groups if jung model is used -group_amount = int(sys.argv[2]) - -# Name of stan model code file -stan_code_file_name = sys.argv[3] - -# Variance prior -sigma_tau = float(sys.argv[4]) - -# figure storage name -data_path = sys.argv[5] - -# Prefix for the figures and log files - -print("These results have been obtained with the following settings:") - -print("Number of groups:", group_amount) - -print("Prior for the variances:", sigma_tau) - -save_name = "sl_compas" - -def inv_logit(x): - return 1.0 / (1.0 + np.exp(-1.0 * x)) - - -def logit(x): - return np.log(x) - np.log(1.0 - x) - - -def inverse_cumulative(x, mu, sigma): - '''Compute the inverse of the cumulative distribution of logit-normal - distribution at x with parameters mu and sigma (mean and st.dev.).''' - - return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) - -def standardError(p, n): - denominator = p * (1 - p) - - return np.sqrt(denominator / n) - -########################## - -# ## Evaluator modules - -# ### Convenience functions - -def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None): - ''' - Fit a predictive model (default logistic regression) with given training - instances and return probabilities for test instances to obtain a given - class label. - - Arguments: - ---------- - - x_train -- x values of training instances - y_train -- y values of training instances - x_test -- x values of test instances - class_value -- class label for which the probabilities are counted for. - model_type -- type of model to be fitted. - - Returns: - -------- - (1) Trained predictive model - (2) Probabilities for given test inputs for given class. 
- ''' - - if model_type is None or model_type in ["logistic_regression", "lr"]: - # Instantiate the model (using the default parameters) - logreg = LogisticRegression(solver='lbfgs') - - # Check shape and fit the model. - if x_train.ndim == 1: - logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train) - else: - logreg = logreg.fit(x_train, y_train) - - label_probs_logreg = getProbabilityForClass(x_test, logreg, - class_value) - - return logreg, label_probs_logreg - - elif model_type in ["random_forest", "rf"]: - # Instantiate the model - forest = RandomForestClassifier(n_estimators=100, max_depth=3) - - # Check shape and fit the model. - if x_train.ndim == 1: - forest = forest.fit(x_train.values.reshape(-1, 1), y_train) - else: - forest = forest.fit(x_train, y_train) - - label_probs_forest = getProbabilityForClass(x_test, forest, - class_value) - - return forest, label_probs_forest - - elif model_type == "fully_random": - - label_probs = np.ones_like(x_test) / 2 - - model_object = lambda x: 0.5 - - return model_object, label_probs - else: - raise ValueError("Invalid model_type!", model_type) - - -def getProbabilityForClass(x, model, class_value): - ''' - Function (wrapper) for obtaining the probability of a class given x and a - predictive model. - - Arguments: - ----------- - x -- individual features, an array of shape (observations, features) - model -- a trained sklearn model. Predicts probabilities for given x. - Should accept input of shape (observations, features) - class_value -- the resulting class to predict (usually 0 or 1). - - Returns: - -------- - (1) The probabilities of given class label for each x. - ''' - if x.ndim == 1: - # if x is vector, transform to column matrix. - f_values = model.predict_proba(np.array(x).reshape(-1, 1)) - else: - f_values = model.predict_proba(x) - - # Get correct column of predicted class, remove extra dimensions and return. - return f_values[:, model.classes_ == class_value].flatten() - - -# ### Contraction algorithm -# -# Below is an implementation of Lakkaraju's team's algorithm presented in -# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant -# parameters to be passed to the function are presented in the description. - -def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, - accRateR_col, r): - ''' - This is an implementation of the algorithm presented by Lakkaraju - et al. in their paper "The Selective Labels Problem: Evaluating - Algorithmic Predictions in the Presence of Unobservables" (2017). - - Arguments: - ---------- - df -- The (Pandas) data frame containing the data, judge decisions, - judge IDs, results and probability scores. - judgeIDJ_col -- String, the name of the column containing the judges' IDs - in df. - decisionT_col -- String, the name of the column containing the judges' decisions - resultY_col -- String, the name of the column containing the realization - modelProbS_col -- String, the name of the column containing the probability - scores from the black-box model B. - accRateR_col -- String, the name of the column containing the judges' - acceptance rates - r -- Float between 0 and 1, the given acceptance rate. - - Returns: - -------- - (1) The estimated failure rate at acceptance rate r. - ''' - # Get ID of the most lenient judge. - most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()] - - # Subset. "D_q is the set of all observations judged by q." 
- D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() - - # All observations of R_q have observed outcome labels. - # "R_q is the set of observations in D_q with observed outcome labels." - R_q = D_q[D_q[decisionT_col] == 1].copy() - - # Sort observations in R_q in descending order of confidence scores S and - # assign to R_sort_q. - # "Observations deemed as high risk by B are at the top of this list" - R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False) - - number_to_remove = int( - round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0]))) - - # "R_B is the list of observations assigned to t = 1 by B" - R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] - - return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0] - - -# ### Evaluators - -def trueEvaluationEvaluator(df, resultY_col, r): - - df.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - - to_release = int(round(df.shape[0] * r)) - - failed = df[resultY_col][0:to_release] == 0 - - return np.sum(failed) / df.shape[0], df.shape[0] - - -def labeledOutcomesEvaluator(df, decisionT_col, resultY_col, r, adjusted=False): - - df_observed = df.loc[df[decisionT_col] == 1, :] - - df_observed = df_observed.sort_values(by='B_prob_0_model', - inplace=False, - ascending=True) - - to_release = int(round(df_observed.shape[0] * r)) - - failed = df_observed[resultY_col][0:to_release] == 0 - - if adjusted: - return np.mean(failed), df.shape[0] - - return np.sum(failed) / df.shape[0], df.shape[0] - - -def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, - accRateR_col, r): - - # Get judges with correct leniency as list - is_correct_leniency = df[accRateR_col].round(1) == r - - # No judges with correct leniency - if np.sum(is_correct_leniency) == 0: - return np.nan, np.nan - - correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] - - # Released are the people they judged and released, T = 1 - released = df[df[judgeIDJ_col].isin(correct_leniency_list) - & (df[decisionT_col] == 1)] - - failed = released[resultY_col] == 0 - - # Get their failure rate, aka ratio of reoffenders to number of people judged in total - return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0] - -################### - -# Read in the data -compas_raw = pd.read_csv(data_path) - -# Select columns -compas = compas_raw[[ - 'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', - 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', - 'two_year_recid', 'c_jail_in', 'c_jail_out' -]] - -# Subset values, see reasons in ProPublica methodology. -compas = compas.query('days_b_screening_arrest <= 30 and \ - days_b_screening_arrest >= -30 and \ - is_recid != -1 and \ - c_charge_degree != "O"') - -# Drop row if score_text is na -compas = compas[compas.score_text.notnull()] - -# Recode recidivism values to fit earlier notation -# So here result_Y = 1 - is_recid (inverted binary coding). 
-compas['result_Y'] = np.where(compas['is_recid'] == 1, 0, 1) - -# Convert string values to dummies, drop first so full rank -compas_dummy = pd.get_dummies( - compas, - columns=['c_charge_degree', 'race', 'age_cat', 'sex'], - drop_first=True) - -compas_dummy.drop(columns=[ - 'age', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', - 'two_year_recid', 'score_text', 'is_recid' -], - inplace=True) - -# Shuffle rows for random judge assignment -compas_shuffled = compas_dummy.sample(frac=1) - -nJudges_M = 9 - -# Assign judges as evenly as possible -judge_ID = pd.qcut(np.arange(len(compas_shuffled)), nJudges_M, labels=False) - -# Assign fixed leniencies from 0.1 to 0.9 -judge_leniency = np.arange(1, 10) / 10 - -judge_leniency = judge_leniency[judge_ID] - -compas_shuffled = compas_shuffled.assign(judge_ID=judge_ID, - judge_leniency=judge_leniency) - -# Sort by judges then probabilities in decreasing order -# Most dangerous for each judge are at the top. -compas_shuffled.sort_values(by=["judge_ID", "decile_score"], - ascending=False, - inplace=True) - -# Iterate over the data. Subject will be given a negative decision -# if they are in the top (1-r)*100% of the individuals the judge will judge. -# I.e. if their within-judge-index is under 1 - acceptance threshold times -# the number of subjects assigned to each judge they will receive a -# negative decision. -compas_shuffled.reset_index(drop=True, inplace=True) - -subjects_allocated = compas_shuffled.judge_ID.value_counts() - -compas_shuffled['judge_index'] = compas_shuffled.groupby('judge_ID').cumcount() - -compas_shuffled['decision_T'] = np.where( - compas_shuffled['judge_index'] < (1 - compas_shuffled['judge_leniency']) * - subjects_allocated[compas_shuffled['judge_ID']].values, 0, 1) - -compas_labeled = compas_shuffled.copy() -compas_unlabeled = compas_shuffled.copy() - -# Hide unobserved -compas_labeled.loc[compas_labeled.decision_T == 0, 'result_Y'] = np.nan - -# Choose feature_columns -feature_cols = ~compas_labeled.columns.isin( - ['result_Y', 'decile_score', 'judge_ID', 'judge_leniency', 'judge_index', 'decision_T']) - -feature_cols = compas_labeled.columns[feature_cols] - - -####### Draw figures ########### - -failure_rates = np.zeros((8, 6)) -error_Ns = np.zeros((8, 6)) - -# Split data -train, test_labeled = train_test_split(compas_labeled, test_size=0.5) - -# Assign same observations to unlabeled data -test_unlabeled = compas_unlabeled.iloc[test_labeled.index.values] - - -# Train a logistic regression model -B_model, predictions = fitPredictiveModel( - train.loc[train['decision_T'] == 1, feature_cols], - train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[feature_cols], 0) - -test_labeled = test_labeled.assign(B_prob_0_model=predictions) -test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions) - -test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - -kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False) - -# Find observed values -observed = test_labeled['decision_T'] == 1 - -# Assign data to the model -dat = dict(D=1, - N_obs=np.sum(observed), - N_cens=np.sum(~observed), - K=group_amount, - sigma_tau=sigma_tau, - M=len(set(compas_labeled['judge_ID'])), - jj_obs=test_labeled.loc[observed, 'judge_ID']+1, - jj_cens=test_labeled.loc[~observed, 'judge_ID']+1, - kk_obs=kk_array[observed]+1, - kk_cens=kk_array[~observed]+1, - dec_obs=test_labeled.loc[observed, 'decision_T'], - dec_cens=test_labeled.loc[~observed, 'decision_T'], - X_obs=test_labeled.loc[observed, 
'B_prob_0_model'].values.reshape(-1, 1), - X_cens=test_labeled.loc[~observed, 'B_prob_0_model'].values.reshape(-1, 1), - y_obs=test_labeled.loc[observed, 'result_Y'].astype(int)) - -sm = pystan.StanModel(file=stan_code_file_name) - -fit = sm.sampling(data=dat, chains=5, iter=6000, control = dict(adapt_delta=0.9)) - -pars = fit.extract() - -print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w')) - -plt.figure(figsize=(15,30)) - -fit.plot(); - -plt.savefig(save_name + '_stan_diagnostic_plot') - -plt.show() -plt.close('all') - -# Bayes - -# Alusta matriisi, rivillä yksi otos posteriorista -# sarakkeet havaintoja -y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0])) - -# Täydennetään havaitsemattomat estimoiduilla -y_imp[:, ~observed] = 1-pars['y_est'] - -# Täydennetään havaitut havaituilla -y_imp[:, observed] = 1-test_labeled.loc[observed, 'result_Y'] - -Rs = np.arange(.1, .9, .1) - -to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int) - -f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan) - -for i in range(len(to_release_list)): - - failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) - - est_failure_rates = failed / test_labeled.shape[0] - - failure_rates[i, 5] = np.mean(est_failure_rates) - - error_Ns[i, 5] = test_labeled.shape[0] - -for r in np.arange(1, 9): - - print(".", end="") - - # True evaluation - - FR, N = trueEvaluationEvaluator(test_unlabeled, 'result_Y', r / 10) - - failure_rates[r - 1, 0] = FR - error_Ns[r - 1, 0] = N - - # Labeled outcomes only - - FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10) - - failure_rates[r - 1, 1] = FR - error_Ns[r - 1, 1] = N - - # Adjusted labeled outcomes - - FR, N = labeledOutcomesEvaluator(test_labeled, 'decision_T', 'result_Y', r / 10, - adjusted=True) - - failure_rates[r - 1, 2] = FR - error_Ns[r - 1, 2] = N - - # Human evaluation - - FR, N = humanEvaluationEvaluator(test_labeled, 'judge_ID', 'decision_T', - 'result_Y', 'judge_leniency', - r / 10) - - failure_rates[r - 1, 3] = FR - error_Ns[r - 1, 3] = N - - # Contraction - - FR, N = contraction(test_labeled, 'judge_ID', 'decision_T', 'result_Y', - 'B_prob_0_model', 'judge_leniency', r / 10) - - failure_rates[r - 1, 4] = FR - error_Ns[r - 1, 4] = N - -x_ax = np.arange(0.1, 0.9, 0.1) - -failure_SEs = standardError(failure_rates, error_Ns) - -labels = [ - 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', - 'Human evaluation', 'Contraction', 'Potential outcomes' -] -colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'c'] - -for i in range(failure_rates.shape[1]): - plt.errorbar(x_ax, - failure_rates[:, i], - label=labels[i], - c=colours[i], - yerr=failure_SEs[:, i]) - -plt.title('Failure rate vs. 
Acceptance rate') -plt.xlabel('Acceptance rate') -plt.ylabel('Failure rate') -plt.legend() -plt.grid() - -plt.savefig(save_name + '_all') - -plt.show() - -print("\nFailure rates:") -print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) - -print("\nMean absolute errors:") -for i in range(1, failure_rates.shape[1]): - print( - labels[i].ljust(len(max(labels, key=len))), - np.round( - np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) \ No newline at end of file diff --git a/analysis_and_scripts/stan_modelling_theoretic.py b/analysis_and_scripts/stan_modelling_theoretic.py index 4f3c5a98202f1bab666a3b7021b8e06d89ab5d10..47847a9889b6d64aec136305ffad1aef45b8d722 100644 --- a/analysis_and_scripts/stan_modelling_theoretic.py +++ b/analysis_and_scripts/stan_modelling_theoretic.py @@ -1,29 +1,24 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- ''' # Author: Riku Laine -# Date: 25JUL2019 (start) +# Date: 12AUG2019 # Project name: Potential outcomes in model evaluation # Description: This script creates the figures and results used # in synthetic data experiments. # # Parameters: # ----------- -# (1) figure_path : file name for saving the created figures. +# (1) figure_path : file location for saving the created figures. # (2) N_sim : Size of simulated data set. # (3) M_sim : Number of judges in simulated data, # N_sim must be divisible by M_sim! -# (4) which : Which data + outcome analysis should be performed. -# (5) group_amount : How many groups if Jung-inspired model is used. +# (4) which : Which data + outcome analysis should be performed. Ranges from +# 1 to 14. See at the end of the script which figure is which. +# Separate execution preferred for keeping runtime and memory decent. +# (5) group_amount : How many groups if non-linear model is used. # (6) stan_code_file_name : Name of file containing the stan model code. -# (7) sigma_tau : Values of prior variance for the Jung-inspired model. -# -# Modifications: -# -------------- -# 26JUL2019 : RL - Changed add_epsilon default to True -# - Corrected the data set used by the causal model (from -# unlabeled to labeled) -# - Corrections to which == 11, coefficients and prints. -# - All plot, thinning off. -# - Fix one decider with leniency 0.9 +# (7) sigma_tau : Values of prior variance. # ''' # Refer to the `notes.tex` file for explanations about the modular framework. @@ -35,7 +30,6 @@ import pandas as pd import matplotlib.pyplot as plt import scipy.stats as scs import scipy.special as ssp -import scipy.integrate as si import numpy.random as npr from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier @@ -85,8 +79,7 @@ print("Prior for the variances:", sigma_tau) # Basic functions - -def inv_logit(x): +def inverseLogit(x): return 1.0 / (1.0 + np.exp(-1.0 * x)) @@ -94,11 +87,16 @@ def logit(x): return np.log(x) - np.log(1.0 - x) -def inverse_cumulative(x, mu, sigma): +def inverseCumulative(x, mu, sigma): '''Compute the inverse of the cumulative distribution of logit-normal distribution at x with parameters mu and sigma (mean and st.dev.).''' - return inv_logit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) + return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) + +def standardError(p, n): + denominator = p * (1 - p) + + return np.sqrt(denominator / n) # ## Data generation modules @@ -111,11 +109,11 @@ def bernoulliDGWithoutUnobservables(N_total=50000): # Sample feature X from standard Gaussian distribution, N(0, 1). 
df = df.assign(X=npr.normal(size=N_total)) - # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inv_logit(X) - df = df.assign(probabilities_Y=inv_logit(df.X)) + # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X) + df = df.assign(probabilities_Y=inverseLogit(df.X)) - # Draw Y ~ Bernoulli(1 - inv_logit(X)) - # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inv_logit(X) + # Draw Y ~ Bernoulli(1 - inverseLogit(X)) + # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X) results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) df = df.assign(result_Y=results) @@ -137,7 +135,7 @@ def thresholdDGWithUnobservables(N_total=50000, df = df.assign(W=npr.normal(size=N_total)) # Calculate P(Y=0|X, Z, W) - probabilities_Y = inv_logit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) + probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) df = df.assign(probabilities_Y=probabilities_Y) @@ -160,8 +158,8 @@ def bernoulliDGWithUnobservables(N_total=50000, df = df.assign(Z=npr.normal(size=N_total)) df = df.assign(W=npr.normal(size=N_total)) - # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inv_logit(X) - probabilities_Y = inv_logit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) + # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X) + probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) df = df.assign(probabilities_Y=probabilities_Y) @@ -183,6 +181,7 @@ def humanDeciderLakkaraju(df, beta_Z=1, add_epsilon=True): '''Decider module | Non-independent batch decisions.''' + # Decider as specified by Lakkaraju et al. # Assert that every judge will have the same number of subjects. assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" @@ -195,23 +194,24 @@ def humanDeciderLakkaraju(df, # Sample acceptance rates uniformly from a closed interval # from 0.1 to 0.9 and round to tenth decimal place. - # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all - # values. - acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) + acceptance_rates = npr.uniform(.1, .9, nJudges_M) acceptance_rates = np.round(acceptance_rates, 10) + # Replicate the rates so they can be attached to the corresponding judge ID. df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) + # Add noise if add_epsilon: epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) else: epsilon = 0 + # Compute probability for jail decision if featureZ_col is None: - probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) else: - probabilities_T = inv_logit(beta_X * df[featureX_col] + + probabilities_T = inverseLogit(beta_X * df[featureX_col] + beta_Z * df[featureZ_col] + epsilon) @@ -250,8 +250,9 @@ def bernoulliDecider(df, beta_Z=1, add_epsilon=True): '''Use X and Z to make a decision with probability - P(T=0|X, Z)=inv_logit(beta_X*X+beta_Z*Z).''' - + P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).''' + # No leniencies involved + # Assert that every judge will have the same number of subjects. assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" 
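# --- Illustrative aside (not part of the patch): a minimal sketch of the
# leniency-based batch rule used by humanDeciderLakkaraju above. The names
# toy_df, r_example and rng are invented for this example only; the real
# deciders operate on the generated data frame with judge IDs attached.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy_df = pd.DataFrame({'probabilities_T': rng.uniform(size=10)})
r_example = 0.8  # this hypothetical judge releases 80% of the caseload

# Highest P(T=0 | X, Z) first, i.e. the "most dangerous" subjects on top.
toy_df = (toy_df.sort_values(by='probabilities_T', ascending=False)
                .reset_index(drop=True))

# The top (1 - r) share of the caseload receives the negative decision T = 0,
# the rest are released (T = 1); here 2 of the 10 subjects get T = 0.
toy_df['decision_T'] = np.where(
    toy_df.index < (1 - r_example) * len(toy_df), 0, 1)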
@@ -267,9 +268,9 @@ def bernoulliDecider(df, epsilon = 0 if featureZ_col is None: - probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) else: - probabilities_T = inv_logit(beta_X * df[featureX_col] + + probabilities_T = inverseLogit(beta_X * df[featureX_col] + beta_Z * df[featureZ_col] + epsilon) df = df.assign(probabilities_T=probabilities_T) @@ -298,7 +299,8 @@ def quantileDecider(df, nJudges_M=100, beta_X=1, beta_Z=1, - add_epsilon=True): + add_epsilon=True, + leniency_upper_limit=0.9): '''Assign decisions by the value of inverse cumulative distribution function of the logit-normal distribution at leniency r.''' @@ -313,9 +315,7 @@ def quantileDecider(df, # Sample acceptance rates uniformly from a closed interval # from 0.1 to 0.9 and round to tenth decimal place. - # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all - # values. - acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) + acceptance_rates = npr.uniform(.1, leniency_upper_limit, nJudges_M) acceptance_rates = np.round(acceptance_rates, 10) # Replicate the rates so they can be attached to the corresponding judge ID. @@ -327,20 +327,20 @@ def quantileDecider(df, epsilon = 0 if featureZ_col is None: - probabilities_T = inv_logit(beta_X * df[featureX_col] + epsilon) + probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) # Compute the bounds straight from the inverse cumulative. # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2. - df = df.assign(bounds=inverse_cumulative( + df = df.assign(bounds=inverseCumulative( x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2))) else: - probabilities_T = inv_logit(beta_X * df[featureX_col] + + probabilities_T = inverseLogit(beta_X * df[featureX_col] + beta_Z * df[featureZ_col] + epsilon) # Compute the bounds straight from the inverse cumulative. # Assuming X and Z are i.i.d standard Gaussians with variance 1. # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z). - df = df.assign(bounds=inverse_cumulative( + df = df.assign(bounds=inverseCumulative( x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2))) df = df.assign(probabilities_T=probabilities_T) @@ -348,6 +348,12 @@ def quantileDecider(df, # Assign negative decision if the predicted probability (probabilities_T) is # over the judge's threshold (bounds). df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1)) + + # Calculate the observed acceptance rates. + acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values + + # Replicate the rates so they can be attached to the corresponding judge ID. + df.acceptanceRate_R = np.repeat(acceptance_rates, nSubjects_N) df_labeled = df.copy() @@ -360,7 +366,7 @@ def randomDecider(df, nJudges_M=100, use_acceptance_rates=False): '''Doesn't use any information about X and Z to make decisions. If use_acceptance_rates is False (default) then all decisions are positive - with probabiltiy 0.5. If True, probabilities will be sampled from + with probability 0.5. If True, probabilities will be sampled from U(0.1, 0.9) and rounded to tenth decimal place.''' # Assert that every judge will have the same number of subjects. @@ -375,7 +381,10 @@ def randomDecider(df, nJudges_M=100, use_acceptance_rates=False): if use_acceptance_rates: # Sample acceptance rates uniformly from a closed interval # from 0.1 to 0.9 and round to tenth decimal place. 
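+        # Pinning one judge to the maximum leniency matters for contraction:
+        # it needs a decision-maker at least as lenient as every acceptance
+        # rate it is evaluated at, and the implementation below returns NaN
+        # whenever the requested r exceeds the largest leniency in the data.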
- acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10) + # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all + # values. + acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) + acceptance_rates = np.round(acceptance_rates, 10) else: # No real leniency here -> set to 0.5. acceptance_rates = np.ones(nJudges_M) * 0.5 @@ -402,7 +411,9 @@ def biasDecider(df, add_epsilon=True): ''' Biased decider: If X > 1, then X <- X * 0.75. People with high X, - get more positive decisions as they should. + get more positive decisions as they should. And if -2 < X -1, then + X <- X + 0.5. People with X in [-2, 1], get less positive decisions + as they should. ''' @@ -420,7 +431,7 @@ def biasDecider(df, assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" # Use quantile decider, but judge by the biased X. - df_labeled, df = humanDeciderLakkaraju(df, + df_labeled, df = quantileDecider(df, featureX_col='biased_X', featureZ_col=featureZ_col, nJudges_M=nJudges_M, @@ -522,97 +533,6 @@ def getProbabilityForClass(x, model, class_value): # Get correct column of predicted class, remove extra dimensions and return. return f_values[:, model.classes_ == class_value].flatten() - -def cdf(x_0, model, class_value): - ''' - Cumulative distribution function as described above. Integral is - approximated using Simpson's rule for efficiency. - - Arguments: - ---------- - - x_0 -- private features of an instance for which the value of cdf is to be - calculated. - model -- a trained sklearn model. Predicts probabilities for given x. - Should accept input of shape (observations, features) - class_value -- the resulting class to predict (usually 0 or 1). - - ''' - - def prediction(x): - return getProbabilityForClass( - np.array([x]).reshape(-1, 1), model, class_value) - - prediction_x_0 = prediction(x_0) - - x_values = np.linspace(-15, 15, 40000) - - x_preds = prediction(x_values) - - y_values = scs.norm.pdf(x_values) - - results = np.zeros(x_0.shape[0]) - - for i in range(x_0.shape[0]): - - y_copy = y_values.copy() - - y_copy[x_preds > prediction_x_0[i]] = 0 - - results[i] = si.simps(y_copy, x=x_values) - - return results - - -def bailIndicator(r, y_model, x_train, x_test): - ''' - Indicator function for whether a judge will bail or jail a suspect. - Rationale explained above. - - Algorithm: - ---------- - - (1) Calculate recidivism probabilities from training set with a trained - model and assign them to predictions_train. - - (2) Calculate recidivism probabilities from test set with the trained - model and assign them to predictions_test. - - (3) Construct a quantile function of the probabilities in - in predictions_train. - - (4) - For pred in predictions_test: - - if pred belongs to a percentile (computed from step (3)) lower than r - return True - else - return False - - Arguments: - ---------- - - r -- float, acceptance rate, between 0 and 1 - y_model -- a trained sklearn predictive model to predict the outcome - x_train -- private features of the training instances - x_test -- private features of the test instances - - Returns: - -------- - (1) Boolean list indicating a bail decision (bail = True) for each - instance in x_test. 
- ''' - - predictions_train = getProbabilityForClass(x_train, y_model, 0) - - predictions_test = getProbabilityForClass(x_test, y_model, 0) - - return [ - scs.percentileofscore(predictions_train, pred, kind='weak') < r - for pred in predictions_test - ] - - # ### Contraction algorithm # # Below is an implementation of Lakkaraju's team's algorithm presented in @@ -649,6 +569,29 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, # Subset. "D_q is the set of all observations judged by q." D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() + + ##### Agreement rate computation begins ####### + + max_leniency = max(df[accRateR_col]) + + if max_leniency < r: + return np.nan, np.nan + + # Sort D_q + # "Observations deemed as high risk by B are at the top of this list" + D_sort_q = D_q.sort_values(by=modelProbS_col, ascending=False) + + # The agreement rate is now computed as the percentage of all the positive + # decisions given by q in the 1-r % of the most dangerous subjects. + all_negatives = np.sum(D_sort_q[decisionT_col] == 0) + + n_most_dangerous = int(round(D_q.shape[0] * (1-max_leniency))) + + agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous] == 0) + + print("The agreement rate of contraction is:", agreed_decisions/all_negatives) + + ##### Agreement rate computation ends ####### # All observations of R_q have observed outcome labels. # "R_q is the set of observations in D_q with observed outcome labels." @@ -665,50 +608,20 @@ def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, # "R_B is the list of observations assigned to t = 1 by B" R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] - return np.sum(R_B[resultY_col] == 0) / D_q.shape[0] + return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0] # ### Evaluators -def contractionEvaluator(df, featureX_col, judgeIDJ_col, decisionT_col, - resultY_col, accRateR_col, r): - - train, test = train_test_split(df, test_size=0.5) - - B_model, predictions = fitPredictiveModel( - train.loc[train[decisionT_col] == 1, featureX_col], - train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], - 0) - - test = test.assign(B_prob_0_model=predictions) - - # Invoke the original contraction. 
- FR = contraction(test, - judgeIDJ_col=judgeIDJ_col, - decisionT_col=decisionT_col, - resultY_col=resultY_col, - modelProbS_col="B_prob_0_model", - accRateR_col=accRateR_col, - r=r) - - return FR - - def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r): - train, test = train_test_split(df, test_size=0.5) - - B_model, predictions = fitPredictiveModel(train[featureX_col], - train[resultY_col], - test[featureX_col], 0) + df.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - test = test.assign(B_prob_0_model=predictions) - - test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - - to_release = int(round(test.shape[0] * r)) + to_release = int(round(df.shape[0] * r)) + + failed = df[resultY_col][0:to_release] == 0 - return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + return np.sum(failed) / df.shape[0], df.shape[0] def labeledOutcomesEvaluator(df, @@ -718,28 +631,20 @@ def labeledOutcomesEvaluator(df, r, adjusted=False): - train, test = train_test_split(df, test_size=0.5) - - B_model, predictions = fitPredictiveModel( - train.loc[train[decisionT_col] == 1, featureX_col], - train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], - 0) + df_observed = df.loc[df[decisionT_col] == 1, :] - test = test.assign(B_prob_0_model=predictions) - - test_observed = test.loc[test[decisionT_col] == 1, :] - - test_observed = test_observed.sort_values(by='B_prob_0_model', + df_observed = df_observed.sort_values(by='B_prob_0_model', inplace=False, ascending=True) - to_release = int(round(test_observed.shape[0] * r)) + to_release = int(round(df_observed.shape[0] * r)) + + failed = df_observed[resultY_col][0:to_release] == 0 if adjusted: - return np.mean(test_observed[resultY_col][0:to_release] == 0) + return np.mean(failed), df.shape[0] - return np.sum( - test_observed[resultY_col][0:to_release] == 0) / test.shape[0] + return np.sum(failed) / df.shape[0], df.shape[0] def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, @@ -750,7 +655,7 @@ def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, # No judges with correct leniency if np.sum(is_correct_leniency) == 0: - return np.nan + return np.nan, np.nan correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] @@ -758,24 +663,10 @@ def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, released = df[df[judgeIDJ_col].isin(correct_leniency_list) & (df[decisionT_col] == 1)] + failed = released[resultY_col] == 0 + # Get their failure rate, aka ratio of reoffenders to number of people judged in total - return np.sum(released[resultY_col] == 0) / correct_leniency_list.shape[0] - - -def causalEvaluator(df, featureX_col, decisionT_col, resultY_col, r): - - train, test = train_test_split(df, test_size=0.5) - - B_model, predictions = fitPredictiveModel( - train.loc[train[decisionT_col] == 1, featureX_col], - train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], - 0) - - test = test.assign(B_prob_0_model=predictions) - - released = cdf(test[featureX_col], B_model, 0) < r - - return np.mean(test.B_prob_0_model * released) + return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0] def monteCarloEvaluator(df, @@ -791,27 +682,17 @@ def monteCarloEvaluator(df, sigma_X=1, sigma_Z=1): - # Train the models and assign the predicted probabilities. 
- train, test = train_test_split(df, test_size=0.5) - - B_model, predictions = fitPredictiveModel( - train.loc[train[decisionT_col] == 1, featureX_col], - train.loc[train[decisionT_col] == 1, resultY_col], test[featureX_col], - 0) - - test = test.assign(B_prob_0_model=predictions) - # Compute the predicted/assumed decision bounds for all the judges. - q_r = inverse_cumulative(x=test[accRateR_col], + q_r = inverseCumulative(x=df[accRateR_col], mu=mu_X + mu_Z, sigma=np.sqrt((beta_X * sigma_X)**2 + (beta_Z * sigma_Z)**2)) - test = test.assign(bounds=logit(q_r) - test[featureX_col]) + df = df.assign(bounds=logit(q_r) - df[featureX_col]) # Compute the expectation of Z when it is known to come from truncated # Gaussian. - alphabeta = (test.bounds - mu_Z) / (sigma_Z) + alphabeta = (df.bounds - mu_Z) / (sigma_Z) Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z) # 1 - cdf(ab) @@ -822,31 +703,27 @@ def monteCarloEvaluator(df, exp_upper_trunc = mu_Z - ( sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta) - exp_Z = (1 - test[decisionT_col] - ) * exp_lower_trunc + test[decisionT_col] * exp_upper_trunc + exp_Z = (1 - df[decisionT_col] + ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc # Attach the predicted probability for Y=0 to data. - test = test.assign(predicted_Y=inv_logit(test[featureX_col] + exp_Z)) + df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z)) - # Predictions drawn from binomial. (Should fix this.) - predictions = npr.binomial(n=1, p=1 - test.predicted_Y, size=test.shape[0]) + # Predictions drawn from binomial. + predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0]) - test[resultY_col] = np.where(test[decisionT_col] == 0, predictions, - test[resultY_col]) + df[resultY_col] = np.where(df[decisionT_col] == 0, predictions, + df[resultY_col]) - test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - - to_release = int(round(test.shape[0] * r)) - - return np.sum(test[resultY_col][0:to_release] == 0) / test.shape[0] + df.sort_values(by='B_prob_0_model', inplace=True, ascending=True) + to_release = int(round(df.shape[0] * r)) + + failed = df[resultY_col][0:to_release] == 0 -# ## Performance comparison -# -# Below we try to replicate the results obtained by Lakkaraju and compare -# their model's performance to the one of ours. + return np.sum(failed) / df.shape[0], df.shape[0] -def drawDiagnostics(title, save_name, save, f_rates, titles): +def drawDiagnostics(title, save_name, f_rates, titles): cols = 2 rows = np.ceil(len(f_rates) / cols) @@ -874,394 +751,485 @@ def drawDiagnostics(title, save_name, save, f_rates, titles): plt.grid() plt.tight_layout() - plt.subplots_adjust(top=0.89) + plt.subplots_adjust(top=0.85) plt.suptitle(title, y=0.96, weight='bold') - if save: - plt.savefig(save_name + '_diagnostic_plot') + plt.savefig(save_name + '_diagnostic_plot') plt.show() - -def perfComp(dgModule, deciderModule, title, save_name, save=True, nIter=50): - failure_rates = np.zeros((8, 7)) - failure_sems = np.zeros((8, 7)) +def perfComp(dgModule, deciderModule, title, save_name, model_type=None, + fit_with_Z=False, nIter=10): + '''Performance comparison, a.k.a the main figure. + + Parameters: + ----------- + dgModule: Module creating the data + decisderModule: Module assigning the decisions + title: Title for the diagnostic figure + save_name: name for saving + model_type: what kind of model should be fitted. 
Logistic regression + ("lr"), random forest ("rf") or fully random + ("fully_random") + fit_with_Z: should the predictive model be fit with Z (the latent + variable) + nIter: Number of train-test splits to be performed. + + ''' + + failure_rates = np.zeros((8, 4)) + failure_stds = np.zeros((8, 4)) f_rate_true = np.zeros((nIter, 8)) f_rate_label = np.zeros((nIter, 8)) - f_rate_label_adj = np.zeros((nIter, 8)) - f_rate_human = np.zeros((nIter, 8)) f_rate_cont = np.zeros((nIter, 8)) - f_rate_caus = np.zeros((nIter, 8)) + f_rate_cf = np.zeros((nIter, 8)) # Create data df = dgModule() - # Decicions + # Assign decicions df_labeled, df_unlabeled = deciderModule(df) - - # Split data - train, test = train_test_split(df_labeled, test_size=0.5) - - # Train model - B_model, predictions = fitPredictiveModel( - train.loc[train['decision_T'] == 1, 'X'], - train.loc[train['decision_T'] == 1, 'result_Y'], test['X'], 0) - - test = test.assign(B_prob_0_model=predictions) - - test.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - - kk_array = pd.qcut(test['B_prob_0_model'], group_amount, labels=False) - - # Find observed values - observed = test['decision_T'] == 1 - - # Assign data to the model - dat = dict(D=1, - N_obs=np.sum(observed), - N_cens=np.sum(~observed), - K=group_amount, - sigma_tau=sigma_tau, - M=len(set(df_unlabeled['judgeID_J'])), - jj_obs=test.loc[observed, 'judgeID_J']+1, - jj_cens=test.loc[~observed, 'judgeID_J']+1, - kk_obs=kk_array[observed]+1, - kk_cens=kk_array[~observed]+1, - dec_obs=test.loc[observed, 'decision_T'], - dec_cens=test.loc[~observed, 'decision_T'], - X_obs=test.loc[observed, 'B_prob_0_model'].values.reshape(-1,1), - X_cens=test.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1), - y_obs=test.loc[observed, 'result_Y'].astype(int)) - - fit = sm.sampling(data=dat, chains=5, iter=4000, control = dict(adapt_delta=0.9)) - - pars = fit.extract() - - plt.figure(figsize=(15,30)) - - fit.plot(); - - if save: - plt.savefig(save_name + '_stan_diagnostic_plot') - plt.show() - plt.close('all') - - if save: - print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w')) + for i in range(nIter): - # Bayes - - # Alusta matriisi, rivillä yksi otos posteriorista - # sarakkeet havaintoja - y_imp = np.ones((pars['y_est'].shape[0], test.shape[0])) + # Split data + train, test_labeled = train_test_split(df_labeled, test_size=0.5) + + # Assign same observations to unlabeled dat + test_unlabeled = df_unlabeled.iloc[test_labeled.index.values] + + # Train model + if fit_with_Z: + B_model, predictions = fitPredictiveModel( + train.loc[train['decision_T'] == 1, ['X', 'Z']], + train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[['X', 'Z']], + 0, model_type=model_type) - # Täydennetään havaitsemattomat estimoiduilla - y_imp[:, ~observed] = 1-pars['y_est'] + else: + B_model, predictions = fitPredictiveModel( + train.loc[train['decision_T'] == 1, 'X'], + train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], + 0, model_type=model_type) + + # Attach predictions to data + test_labeled = test_labeled.assign(B_prob_0_model=predictions) + test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions) - # Täydennetään havaitut havaituilla - y_imp[:, observed] = 1-test.loc[observed, 'result_Y'] - - Rs = np.arange(.1, .9, .1) + test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - to_release_list = np.round(test.shape[0] * Rs).astype(int) + # Assign group IDs + kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False) 
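+        # pd.qcut with labels=False bins the predicted probabilities into
+        # group_amount equal-sized quantile groups coded 0..group_amount-1;
+        # for example, twelve evenly spread probabilities with group_amount=3
+        # yield the codes [0 0 0 0 1 1 1 1 2 2 2 2]. The +1 applied below
+        # (kk_obs, kk_cens) converts them to the 1-based group indices that
+        # the Stan model expects.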
- f_rate_bayes = np.full((pars['y_est'].shape[0], 8), np.nan) + # Find observed values + observed = test_labeled['decision_T'] == 1 - for i in range(len(to_release_list)): - est_failure_rates = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) / test.shape[0] + # Assign data to the model - f_rate_bayes[:, i] = est_failure_rates + ############################### + # Change this assignment if you want to use predictions as input to the + # model. + ############################### + dat = dict(D=1, + N_obs=np.sum(observed), + N_cens=np.sum(~observed), + K=group_amount, + sigma_tau=sigma_tau, + M=len(set(df_unlabeled['judgeID_J'])), + jj_obs=test_labeled.loc[observed, 'judgeID_J']+1, + jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1, + kk_obs=kk_array[observed]+1, + kk_cens=kk_array[~observed]+1, + dec_obs=test_labeled.loc[observed, 'decision_T'], + dec_cens=test_labeled.loc[~observed, 'decision_T'], + X_obs=test_labeled.loc[observed, 'X'].values.reshape(-1,1), + X_cens=test_labeled.loc[~observed, 'X'].values.reshape(-1,1), + y_obs=test_labeled.loc[observed, 'result_Y'].astype(int)) + + # Do the sampling - Change this if you wish to use other methods. + fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9)) + + pars = fit.extract() - failure_rates[i, 6] = np.mean(est_failure_rates) - - for i in range(nIter): - print(" [", i, "] ", sep='', end="") - - for r in np.arange(1, 9): + + plt.figure(figsize=(15,30)) + + fit.plot(); + + plt.savefig(save_name + '_stan_convergence_diagnostic_plot_' + str(i)) + + plt.show() + plt.close('all') + gc.collect() + + print(fit, file=open(save_name + '_stan_fit_diagnostics_' + str(i) + '.txt', 'w')) + # Bayes + + # Format matrix, each row is a sample of the posterior + # columns are observations + + y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0])) + + # Impute the unobserved with the estimated + # Reverse the binary coding to compute failures as sum. 
0 == 0 = 1 + y_imp[:, ~observed] = 1 - pars['y_est'] + + # Remove the variables to conserve memory + del(pars) + del(fit) + + # Impute the observed + y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y'] + + for r in range(1, 9): + + to_release = np.round(test_labeled.shape[0] * (r / 10)).astype(int) + + failed = np.sum(y_imp[:, 0:to_release], axis=1) + + f_rate_cf[i, r - 1] = np.mean(failed / test_labeled.shape[0]) + print(".", end="") - + # True evaluation - - f_rate_true[i, r - 1] = trueEvaluationEvaluator( - df_unlabeled, 'X', 'decision_T', 'result_Y', r / 10) - + + f_rate_true[i, r - 1], N = trueEvaluationEvaluator(test_unlabeled, + 'X', 'decision_T', 'result_Y', r / 10) + # Labeled outcomes only - - f_rate_label[i, r - 1] = labeledOutcomesEvaluator( - df_labeled, 'X', 'decision_T', 'result_Y', r / 10) - - # Adjusted labeled outcomes - - f_rate_label_adj[i, r - 1] = labeledOutcomesEvaluator( - df_labeled, - 'X', - 'decision_T', - 'result_Y', - r / 10, - adjusted=True) - - # Human evaluation - - f_rate_human[i, r - 1] = humanEvaluationEvaluator( - df_labeled, 'judgeID_J', 'decision_T', 'result_Y', - 'acceptanceRate_R', r / 10) - + + f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled, + 'X', 'decision_T', 'result_Y', r / 10) + # Contraction - - f_rate_cont[i, r - 1] = contractionEvaluator( - df_labeled, 'X', 'judgeID_J', 'decision_T', 'result_Y', - 'acceptanceRate_R', r / 10) - - # Causal model - analytic solution - - f_rate_caus[i, r - 1] = monteCarloEvaluator( - df_labeled, 'X', 'decision_T', 'result_Y', 'acceptanceRate_R', - r / 10) - + + f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judgeID_J', + 'decision_T', 'result_Y', 'B_prob_0_model', + 'acceptanceRate_R', r / 10) + failure_rates[:, 0] = np.mean(f_rate_true, axis=0) failure_rates[:, 1] = np.mean(f_rate_label, axis=0) - failure_rates[:, 2] = np.mean(f_rate_label_adj, axis=0) - failure_rates[:, 3] = np.mean(f_rate_human, axis=0) - failure_rates[:, 4] = np.mean(f_rate_cont, axis=0) - failure_rates[:, 5] = np.mean(f_rate_caus, axis=0) - #failure_rates[:, 6] = f_rate_bayes - - failure_sems[:, 0] = scs.sem(f_rate_true, axis=0) - failure_sems[:, 1] = scs.sem(f_rate_label, axis=0) - failure_sems[:, 2] = scs.sem(f_rate_label_adj, axis=0) - failure_sems[:, 3] = scs.sem(f_rate_human, axis=0) - failure_sems[:, 4] = scs.sem(f_rate_cont, axis=0) - failure_sems[:, 5] = scs.sem(f_rate_caus, axis=0) - failure_sems[:, 6] = scs.sem(f_rate_bayes, axis=0, nan_policy='omit') + failure_rates[:, 2] = np.mean(f_rate_cont, axis=0) + failure_rates[:, 3] = np.mean(f_rate_cf, axis=0) + + failure_stds[:, 0] = np.std(f_rate_true, axis=0) + failure_stds[:, 1] = np.std(f_rate_label, axis=0) + failure_stds[:, 2] = np.std(f_rate_cont, axis=0) + failure_stds[:, 3] = np.std(f_rate_cf, axis=0) x_ax = np.arange(0.1, 0.9, 0.1) - labels = [ - 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', - 'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes' - ] - colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c'] + labels = ['True Evaluation', 'Labeled outcomes', 'Contraction', + 'Counterfactuals'] + + colours = ['g', 'magenta', 'b', 'r'] + + line_styles = ['--', ':', '-.', '-'] for i in range(failure_rates.shape[1]): plt.errorbar(x_ax, failure_rates[:, i], label=labels[i], c=colours[i], - yerr=failure_sems[:, i]) + linestyle=line_styles[i], + yerr=failure_stds[:, i]) - plt.title('Failure rate vs. Acceptance rate') + #plt.title('Failure rate vs. 
Acceptance rate') plt.xlabel('Acceptance rate') plt.ylabel('Failure rate') plt.legend() plt.grid() - if save: - plt.savefig(save_name + '_all') + plt.savefig(save_name + '_all') plt.show() print("\nFailure rates:") print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) + print("\nStandard deviations:") + print(np.array2string(failure_stds, formatter={'float_kind':lambda x: "%.5f" % x})) + print("\nMean absolute errors:") for i in range(1, failure_rates.shape[1]): print( labels[i].ljust(len(max(labels, key=len))), np.round( np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) - + drawDiagnostics(title=title, - save_name=save_name, - save=save, - f_rates=[ - f_rate_true, f_rate_label, f_rate_label_adj, - f_rate_human, f_rate_cont, f_rate_caus, f_rate_bayes - ], - titles=labels) - + save_name=save_name, + f_rates=[f_rate_true, f_rate_label, f_rate_cont, f_rate_cf], + titles=labels) +# Compile stan model sm = pystan.StanModel(file=stan_code_file_name) if which == 1: - print("Without unobservables (Bernoulli + independent decisions)") + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: random and random") - dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim) + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) + decider = lambda x: randomDecider(x, nJudges_M=M_sim, use_acceptance_rates=True) perfComp( dg, lambda x: decider(x), "Fluctuation of failure rate estimates across iterations\n" + "Bernoulli + independent decisions, without unobservables", - figure_path + "sl_bernoulli_independent_without_Z" + figure_path + "sl_bernoulli_independent_without_Z", + model_type="fully_random", + fit_with_Z=False ) - -gc.collect() + plt.close('all') - -print("With unobservables in the data") +gc.collect() if which == 2: - print("\nBernoulli + independent decisions") + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: random and y ~ x") dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + decider = lambda x: randomDecider(x, nJudges_M=M_sim, use_acceptance_rates=True) perfComp( dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Bernoulli + independent decisions, with unobservables", - figure_path + "sl_bernoulli_independent_with_Z", + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=False ) -gc.collect() -plt.close('all') - if which == 3: - print("\nThreshold rule + independent decisions") + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: random and y ~ x + z") - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + decider = lambda x: randomDecider(x, nJudges_M=M_sim, use_acceptance_rates=True) perfComp( dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across 
iterations \n" + - "Threshold rule + independent decisions, with unobservables", - figure_path + "sl_threshold_independent_with_Z", + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=True ) -gc.collect() -plt.close('all') - if which == 4: - print("\nBernoulli + non-independent (batch) decisions") + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: y ~ x and random") dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) perfComp( dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Bernoulli + non-independent decisions, with unobservables", - figure_path + "sl_bernoulli_batch_with_Z", + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="fully_random", + fit_with_Z=False ) -gc.collect() -plt.close('all') - if which == 5: - print("\nThreshold rule + non-independent (batch) decisions") + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: y ~ x and y ~ x") - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) perfComp( dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Threshold rule + non-independent decisions, with unobservables", - figure_path + "sl_threshold_batch_with_Z", + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=False ) - -gc.collect() -plt.close('all') - + if which == 6: - print("\nRandom decider") + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: y ~ x and y ~ x + z") dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: randomDecider( - x, nJudges_M=M_sim, use_acceptance_rates=True) + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) perfComp( dg, lambda x: decider(x), - "Bernoulli + random decider with leniency and unobservables", - figure_path + "sl_random_decider_with_Z", + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=True ) + +if which == 7: + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: y ~ x + z and random") -gc.collect() -plt.close('all') + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) -if which == 
7: - print("\nBiased decider") + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="fully_random", + fit_with_Z=False + ) + +if which == 8: + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: y ~ x + z and y ~ x") dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True) + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1) perfComp( dg, lambda x: decider(x), - "Bernoulli + biased decider with leniency and unobservables", - figure_path + "sl_biased_decider_with_Z", + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=False ) + +if which == 9: + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: y ~ x + z and y ~ x + z") + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) -if which == 8: - print("\nBad judge") + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=True + ) + +if which == 10: + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: biased and random") dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim) + decider = lambda x: biasDecider( + x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1) perfComp( dg, lambda x: decider(x), - "Bernoulli + 'bad' decider with leniency and unobservables", - figure_path + "sl_bad_decider_with_Z" + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="fully_random", + fit_with_Z=False ) -gc.collect() -plt.close('all') +if which == 11: + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: biased and y ~ x") -if which == 9: - print("\nBernoulli + Bernoulli") + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) + + decider = lambda x: biasDecider( + x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1) + + perfComp( + dg, lambda x: decider(x), + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=False + ) + +if which == 12: + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Decision-maker in the data and model: biased and y ~ x + z") dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) 
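+    # biasDecider distorts X before judging: X > 1 is scaled by 0.75 and
+    # X in (-2, -1) is shifted up by 0.5, so the decisions recorded in the
+    # data deviate from the true risk ordering (see biasDecider above).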
- decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim) + decider = lambda x: biasDecider( + x, featureX_col="X", featureZ_col='Z', nJudges_M=M_sim, beta_X=1, beta_Z=1) perfComp( dg, lambda x: decider(x), - "Bernoulli + Bernoulli", - figure_path + "sl_bernoulli_bernoulli_with_Z", + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=True ) -if which == 10: - print("\nBeta_Z = 3, Threshold + batch") +if which == 13: + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Fewer subjects per decision-maker, from 500 to 100. Now N_total =", N_sim / 5) - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0) + dg = lambda: bernoulliDGWithUnobservables(N_total=int(N_sim / 5)) - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True) + decider = lambda x: quantileDecider( + x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) perfComp( dg, lambda x: decider(x), - "Beta_Z = 3, threshold + batch", - figure_path + "sl_threshold_batch_beta_Z_3_with_Z", + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=True ) -if which == 11: - print("\nBeta_Z = 5, Threshold + batch") +if which == 14: + print("\nWith unobservables (Bernoullian outcome + independent decisions)") + + print("Now R ~ Uniform(0.1, 0.4)") - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0) + dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True) + decider = lambda x: quantileDecider(x, featureX_col="X", featureZ_col=None, + nJudges_M=M_sim, beta_X=1, beta_Z=1, + leniency_upper_limit=0.4) perfComp( dg, lambda x: decider(x), - "Beta_Z = 5, threshold + batch", - figure_path + "sl_threshold_batch_beta_Z_5_with_Z", - ) \ No newline at end of file + "Fluctuation of failure rate estimates across iterations\n" + + "Bernoulli + independent decisions, without unobservables", + figure_path + "sl_bernoulli_independent_without_Z", + model_type="lr", + fit_with_Z=True + ) diff --git a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py b/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py deleted file mode 100644 index 4ea26d06107af27e62f35193be53082b2253f5e9..0000000000000000000000000000000000000000 --- a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useX.py +++ /dev/null @@ -1,1101 +0,0 @@ -''' -# Author: Riku Laine -# Date: 25JUL2019 (start) -# Project name: Potential outcomes in model evaluation -# Description: This script creates the figures and results used -# in synthetic data experiments. -# -# Parameters: -# ----------- -# (1) figure_path : file name for saving the created figures. -# (2) N_sim : Size of simulated data set. -# (3) M_sim : Number of judges in simulated data, -# N_sim must be divisible by M_sim! -# (4) which : Which data + outcome analysis should be performed. -# (5) group_amount : How many groups if Jung-inspired model is used. -# (6) stan_code_file_name : Name of file containing the stan model code. 
-# (7) sigma_tau : Values of prior variance for the Jung-inspired model. -# (8) model_type : What model type to be fitted. Options: -# - "lr" : logistic regression -# - "rf" : random forest -# - "fully_random" : Fully random, all predictions will be 0.5. -# -''' -# Refer to the `notes.tex` file for explanations about the modular framework. - -# Imports - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import scipy.stats as scs -import scipy.special as ssp -import scipy.integrate as si -import numpy.random as npr -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split -import pystan -import gc - -plt.switch_backend('agg') - -import sys - -# figure storage name -figure_path = sys.argv[1] - -# Size of simulated data set -N_sim = int(sys.argv[2]) - -# Number of judges in simulated data, N_sim must be divisible by M_sim! -M_sim = int(sys.argv[3]) - -# Which data + outcome generation should be performed. -which = int(sys.argv[4]) - -# How many groups if jung model is used -group_amount = int(sys.argv[5]) - -# Name of stan model code file -stan_code_file_name = sys.argv[6] - -# Variance prior -sigma_tau = float(sys.argv[7]) - -# Type of model to be fitted -model_type = sys.argv[8] - -# Settings - -plt.rcParams.update({'font.size': 16}) -plt.rcParams.update({'figure.figsize': (10, 6)}) - -print("These results have been obtained with the following settings:") - -print("Number of observations in the simulated data:", N_sim) - -print("Number of judges in the simulated data:", M_sim) - -print("Number of groups:", group_amount) - -print("Prior for the variances:", sigma_tau) - -# Basic functions - - -def inverseLogit(x): - return 1.0 / (1.0 + np.exp(-1.0 * x)) - - -def logit(x): - return np.log(x) - np.log(1.0 - x) - - -def inverseCumulative(x, mu, sigma): - '''Compute the inverse of the cumulative distribution of logit-normal - distribution at x with parameters mu and sigma (mean and st.dev.).''' - - return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) - -def standardError(p, n): - denominator = p * (1 - p) - - return np.sqrt(denominator / n) - - -# ## Data generation modules - -def bernoulliDGWithoutUnobservables(N_total=50000): - '''Generates data | Variables: X, Y | Outcome from Bernoulli''' - - df = pd.DataFrame() - - # Sample feature X from standard Gaussian distribution, N(0, 1). - df = df.assign(X=npr.normal(size=N_total)) - - # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X) - df = df.assign(probabilities_Y=inverseLogit(df.X)) - - # Draw Y ~ Bernoulli(1 - inverseLogit(X)) - # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X) - results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) - - df = df.assign(result_Y=results) - - return df - - -def thresholdDGWithUnobservables(N_total=50000, - beta_X=1.0, - beta_Z=1.0, - beta_W=0.2): - '''Generates data | Variables: X, Z, W, Y | Outcome by threshold''' - - df = pd.DataFrame() - - # Sample the variables from standard Gaussian distributions. 
- df = df.assign(X=npr.normal(size=N_total)) - df = df.assign(Z=npr.normal(size=N_total)) - df = df.assign(W=npr.normal(size=N_total)) - - # Calculate P(Y=0|X, Z, W) - probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) - - df = df.assign(probabilities_Y=probabilities_Y) - - # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise - df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1)) - - return df - - -def bernoulliDGWithUnobservables(N_total=50000, - beta_X=1.0, - beta_Z=1.0, - beta_W=0.2): - '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli''' - - df = pd.DataFrame() - - # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1). - df = df.assign(X=npr.normal(size=N_total)) - df = df.assign(Z=npr.normal(size=N_total)) - df = df.assign(W=npr.normal(size=N_total)) - - # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X) - probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) - - df = df.assign(probabilities_Y=probabilities_Y) - - # Draw Y from Bernoulli distribution - results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) - - df = df.assign(result_Y=results) - - return df - - -# ## Decider modules - -def humanDeciderLakkaraju(df, - featureX_col, - featureZ_col=None, - nJudges_M=100, - beta_X=1, - beta_Z=1, - add_epsilon=True): - '''Decider module | Non-independent batch decisions.''' - - # Assert that every judge will have the same number of subjects. - assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" - - # Compute the number of subjects allocated for each judge. - nSubjects_N = int(df.shape[0] / nJudges_M) - - # Assign judge IDs as running numbering from 0 to nJudges_M - 1 - df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) - - # Sample acceptance rates uniformly from a closed interval - # from 0.1 to 0.9 and round to tenth decimal place. - # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all - # values. - acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) - acceptance_rates = np.round(acceptance_rates, 10) - - # Replicate the rates so they can be attached to the corresponding judge ID. - df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) - - if add_epsilon: - epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) - else: - epsilon = 0 - - if featureZ_col is None: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) - else: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + - beta_Z * df[featureZ_col] + epsilon) - - - df = df.assign(probabilities_T=probabilities_T) - - # Sort by judges then probabilities in decreasing order - # Most dangerous for each judge are at the top. - df.sort_values(by=["judgeID_J", "probabilities_T"], - ascending=False, - inplace=True) - - # Iterate over the data. Subject will be given a negative decision - # if they are in the top (1-r)*100% of the individuals the judge will judge. - # I.e. if their within-judge-index is under 1 - acceptance threshold times - # the number of subjects assigned to each judge they will receive a - # negative decision. 
- df.reset_index(drop=True, inplace=True) - - df['decision_T'] = np.where((df.index.values % nSubjects_N) < - ((1 - df['acceptanceRate_R']) * nSubjects_N), - 0, 1) - - df_labeled = df.copy() - - # Hide unobserved - df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan - - return df_labeled, df - - -def bernoulliDecider(df, - featureX_col, - featureZ_col=None, - nJudges_M=100, - beta_X=1, - beta_Z=1, - add_epsilon=True): - '''Use X and Z to make a decision with probability - P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).''' - - # Assert that every judge will have the same number of subjects. - assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" - - # Compute the number of subjects allocated for each judge. - nSubjects_N = int(df.shape[0] / nJudges_M) - - # Assign judge IDs as running numbering from 0 to nJudges_M - 1 - df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) - - if add_epsilon: - epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) - else: - epsilon = 0 - - if featureZ_col is None: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) - else: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + - beta_Z * df[featureZ_col] + epsilon) - - df = df.assign(probabilities_T=probabilities_T) - - # Draw T from Bernoulli distribution - decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0]) - - df = df.assign(decision_T=decisions) - - # Calculate the acceptance rates. - acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values - - # Replicate the rates so they can be attached to the corresponding judge ID. - df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) - - df_labeled = df.copy() - - df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan - - return df_labeled, df - - -def quantileDecider(df, - featureX_col, - featureZ_col=None, - nJudges_M=100, - beta_X=1, - beta_Z=1, - add_epsilon=True): - '''Assign decisions by the value of inverse cumulative distribution function - of the logit-normal distribution at leniency r.''' - - # Assert that every judge will have the same number of subjects. - assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" - - # Compute the number of subjects allocated for each judge. - nSubjects_N = int(df.shape[0] / nJudges_M) - - # Assign judge IDs as running numbering from 0 to nJudges_M - 1 - df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) - - # Sample acceptance rates uniformly from a closed interval - # from 0.1 to 0.9 and round to tenth decimal place. - # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all - # values. - acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) - acceptance_rates = np.round(acceptance_rates, 10) - - # Replicate the rates so they can be attached to the corresponding judge ID. - df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) - - if add_epsilon: - epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) - else: - epsilon = 0 - - if featureZ_col is None: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) - - # Compute the bounds straight from the inverse cumulative. - # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2. - df = df.assign(bounds=inverseCumulative( - x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2))) - else: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + - beta_Z * df[featureZ_col] + epsilon) - - # Compute the bounds straight from the inverse cumulative. 
-        # Assuming X and Z are i.i.d. standard Gaussians with variance 1.
-        # Thus Var(bX*X + bZ*Z) = bX**2*Var(X) + bZ**2*Var(Z).
-        df = df.assign(bounds=inverseCumulative(
-            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2)))
-
-    df = df.assign(probabilities_T=probabilities_T)
-
-    # Assign negative decision if the predicted probability (probabilities_T) is
-    # over the judge's threshold (bounds).
-    df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
-    '''Doesn't use any information about X and Z to make decisions.
-
-    If use_acceptance_rates is False (default) then all decisions are positive
-    with probability 0.5. If True, probabilities will be sampled from
-    U(0.1, 0.9) and rounded to tenth decimal place.'''
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    if use_acceptance_rates:
-        # Sample acceptance rates uniformly from a closed interval
-        # from 0.1 to 0.9 and round to tenth decimal place.
-        acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10)
-    else:
-        # No real leniency here -> set to 0.5.
-        acceptance_rates = np.ones(nJudges_M) * 0.5
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    df = df.assign(
-        decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0]))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def biasDecider(df,
-                featureX_col,
-                featureZ_col=None,
-                nJudges_M=100,
-                beta_X=1,
-                beta_Z=1,
-                add_epsilon=True):
-    '''
-    Biased decider: if X > 1, then X <- X * 0.75, so people with high X
-    get more positive decisions than they should. And if -2 < X < -1, then
-    X <- X + 0.5, so people with X in (-2, -1) get fewer positive decisions
-    than they should.
-    '''
-
-    # If X > 1, then X <- X * 0.75. People with high X get more positive
-    # decisions than they should.
-    df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] *
-                                     0.75, df[featureX_col]))
-
-    # If -2 < X < -1, then X <- X + 0.5. People with X in (-2, -1) get fewer
-    # positive decisions than they should.
-    df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1) == 1,
-                           df.biased_X + 0.5, df.biased_X)
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
-
-    # Use the Lakkaraju-style batch decider, but judge by the biased X.
-    df_labeled, df = humanDeciderLakkaraju(df,
-                                           featureX_col='biased_X',
-                                           featureZ_col=featureZ_col,
-                                           nJudges_M=nJudges_M,
-                                           beta_X=beta_X,
-                                           beta_Z=beta_Z,
-                                           add_epsilon=add_epsilon)
-
-    return df_labeled, df
-
-
-# ## Evaluator modules
-
-# ### Convenience functions
-
-def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
-    '''
-    Fit a predictive model (default logistic regression) with given training
-    instances and return probabilities for test instances to obtain a given
-    class label.
- - Arguments: - ---------- - - x_train -- x values of training instances - y_train -- y values of training instances - x_test -- x values of test instances - class_value -- class label for which the probabilities are counted for. - model_type -- type of model to be fitted. - - Returns: - -------- - (1) Trained predictive model - (2) Probabilities for given test inputs for given class. - ''' - - if model_type is None or model_type in ["logistic_regression", "lr"]: - # Instantiate the model (using the default parameters) - logreg = LogisticRegression(solver='lbfgs') - - # Check shape and fit the model. - if x_train.ndim == 1: - logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train) - else: - logreg = logreg.fit(x_train, y_train) - - label_probs_logreg = getProbabilityForClass(x_test, logreg, - class_value) - - return logreg, label_probs_logreg - - elif model_type in ["random_forest", "rf"]: - # Instantiate the model - forest = RandomForestClassifier(n_estimators=100, max_depth=3) - - # Check shape and fit the model. - if x_train.ndim == 1: - forest = forest.fit(x_train.values.reshape(-1, 1), y_train) - else: - forest = forest.fit(x_train, y_train) - - label_probs_forest = getProbabilityForClass(x_test, forest, - class_value) - - return forest, label_probs_forest - - elif model_type == "fully_random": - - label_probs = np.ones_like(x_test) / 2 - - model_object = lambda x: 0.5 - - return model_object, label_probs - else: - raise ValueError("Invalid model_type!", model_type) - - -def getProbabilityForClass(x, model, class_value): - ''' - Function (wrapper) for obtaining the probability of a class given x and a - predictive model. - - Arguments: - ----------- - x -- individual features, an array of shape (observations, features) - model -- a trained sklearn model. Predicts probabilities for given x. - Should accept input of shape (observations, features) - class_value -- the resulting class to predict (usually 0 or 1). - - Returns: - -------- - (1) The probabilities of given class label for each x. - ''' - if x.ndim == 1: - # if x is vector, transform to column matrix. - f_values = model.predict_proba(np.array(x).reshape(-1, 1)) - else: - f_values = model.predict_proba(x) - - # Get correct column of predicted class, remove extra dimensions and return. - return f_values[:, model.classes_ == class_value].flatten() - - -def cdf(x_0, model, class_value): - ''' - Cumulative distribution function as described above. Integral is - approximated using Simpson's rule for efficiency. - - Arguments: - ---------- - - x_0 -- private features of an instance for which the value of cdf is to be - calculated. - model -- a trained sklearn model. Predicts probabilities for given x. - Should accept input of shape (observations, features) - class_value -- the resulting class to predict (usually 0 or 1). - - ''' - - def prediction(x): - return getProbabilityForClass( - np.array([x]).reshape(-1, 1), model, class_value) - - prediction_x_0 = prediction(x_0) - - x_values = np.linspace(-15, 15, 40000) - - x_preds = prediction(x_values) - - y_values = scs.norm.pdf(x_values) - - results = np.zeros(x_0.shape[0]) - - for i in range(x_0.shape[0]): - - y_copy = y_values.copy() - - y_copy[x_preds > prediction_x_0[i]] = 0 - - results[i] = si.simps(y_copy, x=x_values) - - return results - -# ### Contraction algorithm -# -# Below is an implementation of Lakkaraju's team's algorithm presented in -# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). 
Relevant -# parameters to be passed to the function are presented in the description. - -def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, - accRateR_col, r): - ''' - This is an implementation of the algorithm presented by Lakkaraju - et al. in their paper "The Selective Labels Problem: Evaluating - Algorithmic Predictions in the Presence of Unobservables" (2017). - - Arguments: - ---------- - df -- The (Pandas) data frame containing the data, judge decisions, - judge IDs, results and probability scores. - judgeIDJ_col -- String, the name of the column containing the judges' IDs - in df. - decisionT_col -- String, the name of the column containing the judges' decisions - resultY_col -- String, the name of the column containing the realization - modelProbS_col -- String, the name of the column containing the probability - scores from the black-box model B. - accRateR_col -- String, the name of the column containing the judges' - acceptance rates - r -- Float between 0 and 1, the given acceptance rate. - - Returns: - -------- - (1) The estimated failure rate at acceptance rate r. - ''' - # Get ID of the most lenient judge. - most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()] - - # Subset. "D_q is the set of all observations judged by q." - D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() - - # All observations of R_q have observed outcome labels. - # "R_q is the set of observations in D_q with observed outcome labels." - R_q = D_q[D_q[decisionT_col] == 1].copy() - - # Sort observations in R_q in descending order of confidence scores S and - # assign to R_sort_q. - # "Observations deemed as high risk by B are at the top of this list" - R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False) - - number_to_remove = int( - round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0]))) - - # "R_B is the list of observations assigned to t = 1 by B" - R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] - - return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0] - - -# ### Evaluators - -def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r): - - df.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - - to_release = int(round(df.shape[0] * r)) - - failed = df[resultY_col][0:to_release] == 0 - - return np.sum(failed) / df.shape[0], df.shape[0] - - -def labeledOutcomesEvaluator(df, - featureX_col, - decisionT_col, - resultY_col, - r, - adjusted=False): - - df_observed = df.loc[df[decisionT_col] == 1, :] - - df_observed = df_observed.sort_values(by='B_prob_0_model', - inplace=False, - ascending=True) - - to_release = int(round(df_observed.shape[0] * r)) - - failed = df_observed[resultY_col][0:to_release] == 0 - - if adjusted: - return np.mean(failed), df.shape[0] - - return np.sum(failed) / df.shape[0], df.shape[0] - - -def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, - accRateR_col, r): - - # Get judges with correct leniency as list - is_correct_leniency = df[accRateR_col].round(1) == r - - # No judges with correct leniency - if np.sum(is_correct_leniency) == 0: - return np.nan, np.nan - - correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] - - # Released are the people they judged and released, T = 1 - released = df[df[judgeIDJ_col].isin(correct_leniency_list) - & (df[decisionT_col] == 1)] - - failed = released[resultY_col] == 0 - - # Get their failure rate, aka ratio of reoffenders to number of people judged in total - return np.sum(failed) / 
correct_leniency_list.shape[0], correct_leniency_list.shape[0]
-
-
-def monteCarloEvaluator(df,
-                        featureX_col,
-                        decisionT_col,
-                        resultY_col,
-                        accRateR_col,
-                        r,
-                        mu_X=0,
-                        mu_Z=0,
-                        beta_X=1,
-                        beta_Z=1,
-                        sigma_X=1,
-                        sigma_Z=1):
-
-    # Compute the predicted/assumed decision bounds for all the judges.
-    q_r = inverseCumulative(x=df[accRateR_col],
-                            mu=mu_X + mu_Z,
-                            sigma=np.sqrt((beta_X * sigma_X)**2 +
-                                          (beta_Z * sigma_Z)**2))
-
-    df = df.assign(bounds=logit(q_r) - df[featureX_col])
-
-    # Compute the expectation of Z when it is known to come from a truncated
-    # Gaussian.
-    alphabeta = (df.bounds - mu_Z) / (sigma_Z)
-
-    Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z)  # 1 - cdf(ab)
-
-    # E(Z | Z > a). Expectation of Z if negative decision.
-    exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_
-
-    # E(Z | Z < b). Expectation of Z if positive decision.
-    exp_upper_trunc = mu_Z - (
-        sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta)
-
-    exp_Z = (1 - df[decisionT_col]
-             ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc
-
-    # Attach the predicted probability for Y=0 to data.
-    df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z))
-
-    # Predictions drawn from binomial.
-    predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0])
-
-    df[resultY_col] = np.where(df[decisionT_col] == 0, predictions,
-                               df[resultY_col])
-
-    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(df.shape[0] * r))
-
-    failed = df[resultY_col][0:to_release] == 0
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-
-def perfComp(dgModule, deciderModule, title, save_name):
-    failure_rates = np.zeros((8, 7))
-    error_Ns = np.zeros((8, 7))
-
-    # Create data
-    df = dgModule()
-
-    # Decisions
-    df_labeled, df_unlabeled = deciderModule(df)
-
-    # Split data
-    train, test_labeled = train_test_split(df_labeled, test_size=0.5)
-
-    # Assign the same observations to the unlabeled data
-    test_unlabeled = df_unlabeled.iloc[test_labeled.index.values]
-
-    # Train model
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train['decision_T'] == 1, 'X'],
-        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], 0, model_type=model_type)
-
-    # Attach predictions to data
-    test_labeled = test_labeled.assign(B_prob_0_model=predictions)
-    test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
-
-    test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
-
-    # Find observed values
-    observed = test_labeled['decision_T'] == 1
-
-    # Assign data to the model
-    dat = dict(D=1,
-               N_obs=np.sum(observed),
-               N_cens=np.sum(~observed),
-               K=group_amount,
-               sigma_tau=sigma_tau,
-               M=len(set(df_unlabeled['judgeID_J'])),
-               jj_obs=test_labeled.loc[observed, 'judgeID_J']+1,
-               jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1,
-               kk_obs=kk_array[observed]+1,
-               kk_cens=kk_array[~observed]+1,
-               dec_obs=test_labeled.loc[observed, 'decision_T'],
-               dec_cens=test_labeled.loc[~observed, 'decision_T'],
-               X_obs=test_labeled.loc[observed, 'X'].values.reshape(-1,1),
-               X_cens=test_labeled.loc[~observed, 'X'].values.reshape(-1,1),
-               y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
-
-    fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9))
-
-    pars = fit.extract()
-
-    plt.figure(figsize=(15,30))
-
-    fit.plot();
-
-    plt.savefig(save_name + '_stan_diagnostic_plot')
-
-    plt.show()
-    plt.close('all')
-
-    print(fit, 
file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
-
-    # Bayes
-
-    # Initialize a matrix with one posterior draw per row and
-    # one observation per column.
-    y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
-
-    # Fill in the unobserved outcomes with the estimates.
-    y_imp[:, ~observed] = 1 - pars['y_est']
-
-    # Fill in the observed outcomes with the observed values.
-    y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
-
-    Rs = np.arange(.1, .9, .1)
-
-    to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
-
-    for i in range(len(to_release_list)):
-
-        failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1)
-
-        est_failure_rates = failed / test_labeled.shape[0]
-
-        failure_rates[i, 6] = np.mean(est_failure_rates)
-
-        error_Ns[i, 6] = test_labeled.shape[0]
-
-    for r in range(1, 9):
-
-        print(".", end="")
-
-        # True evaluation
-
-        FR, N = trueEvaluationEvaluator(test_unlabeled, 'X', 'decision_T',
-                                        'result_Y', r / 10)
-
-        failure_rates[r - 1, 0] = FR
-        error_Ns[r - 1, 0] = N
-
-        # Labeled outcomes only
-
-        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T',
-                                         'result_Y', r / 10)
-
-        failure_rates[r - 1, 1] = FR
-        error_Ns[r - 1, 1] = N
-
-        # Adjusted labeled outcomes
-
-        FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T',
-                                         'result_Y', r / 10, adjusted=True)
-
-        failure_rates[r - 1, 2] = FR
-        error_Ns[r - 1, 2] = N
-
-        # Human evaluation
-
-        FR, N = humanEvaluationEvaluator(test_labeled, 'judgeID_J',
-                                         'decision_T', 'result_Y',
-                                         'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 3] = FR
-        error_Ns[r - 1, 3] = N
-
-        # Contraction
-
-        FR, N = contraction(test_labeled, 'judgeID_J', 'decision_T',
-                            'result_Y', 'B_prob_0_model', 'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 4] = FR
-        error_Ns[r - 1, 4] = N
-
-        # Causal model - analytic solution
-
-        FR, N = monteCarloEvaluator(test_labeled, 'X', 'decision_T',
-                                    'result_Y', 'acceptanceRate_R', r / 10)
-
-        failure_rates[r - 1, 5] = FR
-        error_Ns[r - 1, 5] = N
-
-    failure_SEs = standardError(failure_rates, error_Ns)
-
-    x_ax = np.arange(0.1, 0.9, 0.1)
-
-    labels = [
-        'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.',
-        'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes'
-    ]
-    colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c']
-
-    for i in range(failure_rates.shape[1]):
-        plt.errorbar(x_ax,
-                     failure_rates[:, i],
-                     label=labels[i],
-                     c=colours[i],
-                     yerr=failure_SEs[:, i])
-
-    plt.title('Failure rate vs. 
Acceptance rate') - plt.xlabel('Acceptance rate') - plt.ylabel('Failure rate') - plt.legend() - plt.grid() - - plt.savefig(save_name + '_all') - - plt.show() - - print("\nFailure rates:") - print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) - - print("\nMean absolute errors:") - for i in range(1, failure_rates.shape[1]): - print( - labels[i].ljust(len(max(labels, key=len))), - np.round( - np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) - - -sm = pystan.StanModel(file=stan_code_file_name) - -if which == 1: - print("\nWithout unobservables (Bernoulli + independent decisions)") - - dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim) - - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations\n" + - "Bernoulli + independent decisions, without unobservables", - figure_path + "sl_bernoulli_independent_without_Z" - ) - -gc.collect() -plt.close('all') - -print("\nWith unobservables in the data") - -if which == 2: - print("\nBernoulli + independent decisions") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Bernoulli + independent decisions, with unobservables", - figure_path + "sl_bernoulli_independent_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 3: - print("\nThreshold rule + independent decisions") - - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) - - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Threshold rule + independent decisions, with unobservables", - figure_path + "sl_threshold_independent_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 4: - print("\nBernoulli + non-independent (batch) decisions") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Bernoulli + non-independent decisions, with unobservables", - figure_path + "sl_bernoulli_batch_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 5: - print("\nThreshold rule + non-independent (batch) decisions") - - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) - - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Threshold rule + non-independent decisions, with unobservables", - figure_path + "sl_threshold_batch_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 6: - print("\nRandom decider") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: randomDecider( - x, nJudges_M=M_sim, use_acceptance_rates=True) - - perfComp( - dg, lambda x: decider(x), - "Bernoulli + random decider with leniency and unobservables", - 
figure_path + "sl_random_decider_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 7: - print("\nBiased decider") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Bernoulli + biased decider with leniency and unobservables", - figure_path + "sl_biased_decider_with_Z", - ) - - -if which == 8: - print("\nBad judge") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim) - - perfComp( - dg, lambda x: decider(x), - "Bernoulli + 'bad' decider with leniency and unobservables", - figure_path + "sl_bad_decider_with_Z" - ) - -gc.collect() -plt.close('all') - -if which == 9: - print("\nBernoulli + Bernoulli") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim) - - perfComp( - dg, lambda x: decider(x), - "Bernoulli + Bernoulli", - figure_path + "sl_bernoulli_bernoulli_with_Z", - ) - -if which == 10: - print("\nBeta_Z = 3, Threshold + batch") - - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0) - - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Beta_Z = 3, threshold + batch", - figure_path + "sl_threshold_batch_beta_Z_3_with_Z", - ) - -if which == 11: - print("\nBeta_Z = 5, Threshold + batch") - - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0) - - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Beta_Z = 5, threshold + batch", - figure_path + "sl_threshold_batch_beta_Z_5_with_Z", - ) \ No newline at end of file diff --git a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py b/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py deleted file mode 100644 index 00711de6fae4e654b2107812690a2c82c9164fc8..0000000000000000000000000000000000000000 --- a/analysis_and_scripts/stan_modelling_theoretic_SEfixed_useprediction.py +++ /dev/null @@ -1,1101 +0,0 @@ -''' -# Author: Riku Laine -# Date: 25JUL2019 (start) -# Project name: Potential outcomes in model evaluation -# Description: This script creates the figures and results used -# in synthetic data experiments. -# -# Parameters: -# ----------- -# (1) figure_path : file name for saving the created figures. -# (2) N_sim : Size of simulated data set. -# (3) M_sim : Number of judges in simulated data, -# N_sim must be divisible by M_sim! -# (4) which : Which data + outcome analysis should be performed. -# (5) group_amount : How many groups if Jung-inspired model is used. -# (6) stan_code_file_name : Name of file containing the stan model code. -# (7) sigma_tau : Values of prior variance for the Jung-inspired model. -# (8) model_type : What model type to be fitted. Options: -# - "lr" : logistic regression -# - "rf" : random forest -# - "fully_random" : Fully random, all predictions will be 0.5. -# -''' -# Refer to the `notes.tex` file for explanations about the modular framework. 
- -# Imports - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import scipy.stats as scs -import scipy.special as ssp -import scipy.integrate as si -import numpy.random as npr -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split -import pystan -import gc - -plt.switch_backend('agg') - -import sys - -# figure storage name -figure_path = sys.argv[1] - -# Size of simulated data set -N_sim = int(sys.argv[2]) - -# Number of judges in simulated data, N_sim must be divisible by M_sim! -M_sim = int(sys.argv[3]) - -# Which data + outcome generation should be performed. -which = int(sys.argv[4]) - -# How many groups if jung model is used -group_amount = int(sys.argv[5]) - -# Name of stan model code file -stan_code_file_name = sys.argv[6] - -# Variance prior -sigma_tau = float(sys.argv[7]) - -# Type of model to be fitted -model_type = sys.argv[8] - -# Settings - -plt.rcParams.update({'font.size': 16}) -plt.rcParams.update({'figure.figsize': (10, 6)}) - -print("These results have been obtained with the following settings:") - -print("Number of observations in the simulated data:", N_sim) - -print("Number of judges in the simulated data:", M_sim) - -print("Number of groups:", group_amount) - -print("Prior for the variances:", sigma_tau) - -# Basic functions - - -def inverseLogit(x): - return 1.0 / (1.0 + np.exp(-1.0 * x)) - - -def logit(x): - return np.log(x) - np.log(1.0 - x) - - -def inverseCumulative(x, mu, sigma): - '''Compute the inverse of the cumulative distribution of logit-normal - distribution at x with parameters mu and sigma (mean and st.dev.).''' - - return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu) - -def standardError(p, n): - denominator = p * (1 - p) - - return np.sqrt(denominator / n) - - -# ## Data generation modules - -def bernoulliDGWithoutUnobservables(N_total=50000): - '''Generates data | Variables: X, Y | Outcome from Bernoulli''' - - df = pd.DataFrame() - - # Sample feature X from standard Gaussian distribution, N(0, 1). - df = df.assign(X=npr.normal(size=N_total)) - - # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X) - df = df.assign(probabilities_Y=inverseLogit(df.X)) - - # Draw Y ~ Bernoulli(1 - inverseLogit(X)) - # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X) - results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) - - df = df.assign(result_Y=results) - - return df - - -def thresholdDGWithUnobservables(N_total=50000, - beta_X=1.0, - beta_Z=1.0, - beta_W=0.2): - '''Generates data | Variables: X, Z, W, Y | Outcome by threshold''' - - df = pd.DataFrame() - - # Sample the variables from standard Gaussian distributions. - df = df.assign(X=npr.normal(size=N_total)) - df = df.assign(Z=npr.normal(size=N_total)) - df = df.assign(W=npr.normal(size=N_total)) - - # Calculate P(Y=0|X, Z, W) - probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) - - df = df.assign(probabilities_Y=probabilities_Y) - - # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise - df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1)) - - return df - - -def bernoulliDGWithUnobservables(N_total=50000, - beta_X=1.0, - beta_Z=1.0, - beta_W=0.2): - '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli''' - - df = pd.DataFrame() - - # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1). 
- df = df.assign(X=npr.normal(size=N_total)) - df = df.assign(Z=npr.normal(size=N_total)) - df = df.assign(W=npr.normal(size=N_total)) - - # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X) - probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W) - - df = df.assign(probabilities_Y=probabilities_Y) - - # Draw Y from Bernoulli distribution - results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total) - - df = df.assign(result_Y=results) - - return df - - -# ## Decider modules - -def humanDeciderLakkaraju(df, - featureX_col, - featureZ_col=None, - nJudges_M=100, - beta_X=1, - beta_Z=1, - add_epsilon=True): - '''Decider module | Non-independent batch decisions.''' - - # Assert that every judge will have the same number of subjects. - assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" - - # Compute the number of subjects allocated for each judge. - nSubjects_N = int(df.shape[0] / nJudges_M) - - # Assign judge IDs as running numbering from 0 to nJudges_M - 1 - df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) - - # Sample acceptance rates uniformly from a closed interval - # from 0.1 to 0.9 and round to tenth decimal place. - # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all - # values. - acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) - acceptance_rates = np.round(acceptance_rates, 10) - - # Replicate the rates so they can be attached to the corresponding judge ID. - df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) - - if add_epsilon: - epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) - else: - epsilon = 0 - - if featureZ_col is None: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) - else: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + - beta_Z * df[featureZ_col] + epsilon) - - - df = df.assign(probabilities_T=probabilities_T) - - # Sort by judges then probabilities in decreasing order - # Most dangerous for each judge are at the top. - df.sort_values(by=["judgeID_J", "probabilities_T"], - ascending=False, - inplace=True) - - # Iterate over the data. Subject will be given a negative decision - # if they are in the top (1-r)*100% of the individuals the judge will judge. - # I.e. if their within-judge-index is under 1 - acceptance threshold times - # the number of subjects assigned to each judge they will receive a - # negative decision. - df.reset_index(drop=True, inplace=True) - - df['decision_T'] = np.where((df.index.values % nSubjects_N) < - ((1 - df['acceptanceRate_R']) * nSubjects_N), - 0, 1) - - df_labeled = df.copy() - - # Hide unobserved - df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan - - return df_labeled, df - - -def bernoulliDecider(df, - featureX_col, - featureZ_col=None, - nJudges_M=100, - beta_X=1, - beta_Z=1, - add_epsilon=True): - '''Use X and Z to make a decision with probability - P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).''' - - # Assert that every judge will have the same number of subjects. - assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" - - # Compute the number of subjects allocated for each judge. 
- nSubjects_N = int(df.shape[0] / nJudges_M) - - # Assign judge IDs as running numbering from 0 to nJudges_M - 1 - df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) - - if add_epsilon: - epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) - else: - epsilon = 0 - - if featureZ_col is None: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) - else: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + - beta_Z * df[featureZ_col] + epsilon) - - df = df.assign(probabilities_T=probabilities_T) - - # Draw T from Bernoulli distribution - decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0]) - - df = df.assign(decision_T=decisions) - - # Calculate the acceptance rates. - acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values - - # Replicate the rates so they can be attached to the corresponding judge ID. - df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) - - df_labeled = df.copy() - - df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan - - return df_labeled, df - - -def quantileDecider(df, - featureX_col, - featureZ_col=None, - nJudges_M=100, - beta_X=1, - beta_Z=1, - add_epsilon=True): - '''Assign decisions by the value of inverse cumulative distribution function - of the logit-normal distribution at leniency r.''' - - # Assert that every judge will have the same number of subjects. - assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!" - - # Compute the number of subjects allocated for each judge. - nSubjects_N = int(df.shape[0] / nJudges_M) - - # Assign judge IDs as running numbering from 0 to nJudges_M - 1 - df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N)) - - # Sample acceptance rates uniformly from a closed interval - # from 0.1 to 0.9 and round to tenth decimal place. - # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all - # values. - acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9) - acceptance_rates = np.round(acceptance_rates, 10) - - # Replicate the rates so they can be attached to the corresponding judge ID. - df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N)) - - if add_epsilon: - epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0]) - else: - epsilon = 0 - - if featureZ_col is None: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon) - - # Compute the bounds straight from the inverse cumulative. - # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2. - df = df.assign(bounds=inverseCumulative( - x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2))) - else: - probabilities_T = inverseLogit(beta_X * df[featureX_col] + - beta_Z * df[featureZ_col] + epsilon) - - # Compute the bounds straight from the inverse cumulative. - # Assuming X and Z are i.i.d standard Gaussians with variance 1. - # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z). - df = df.assign(bounds=inverseCumulative( - x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2))) - - df = df.assign(probabilities_T=probabilities_T) - - # Assign negative decision if the predicted probability (probabilities_T) is - # over the judge's threshold (bounds). - df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1)) - - df_labeled = df.copy() - - df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan - - return df_labeled, df - - -def randomDecider(df, nJudges_M=100, use_acceptance_rates=False): - '''Doesn't use any information about X and Z to make decisions. 
-
-    If use_acceptance_rates is False (default) then all decisions are positive
-    with probability 0.5. If True, probabilities will be sampled from
-    U(0.1, 0.9) and rounded to tenth decimal place.'''
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
-
-    # Compute the number of subjects allocated for each judge.
-    nSubjects_N = int(df.shape[0] / nJudges_M)
-
-    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
-    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
-
-    if use_acceptance_rates:
-        # Sample acceptance rates uniformly from a closed interval
-        # from 0.1 to 0.9 and round to tenth decimal place.
-        acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10)
-    else:
-        # No real leniency here -> set to 0.5.
-        acceptance_rates = np.ones(nJudges_M) * 0.5
-
-    # Replicate the rates so they can be attached to the corresponding judge ID.
-    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
-
-    df = df.assign(
-        decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0]))
-
-    df_labeled = df.copy()
-
-    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
-
-    return df_labeled, df
-
-
-def biasDecider(df,
-                featureX_col,
-                featureZ_col=None,
-                nJudges_M=100,
-                beta_X=1,
-                beta_Z=1,
-                add_epsilon=True):
-    '''
-    Biased decider: if X > 1, then X <- X * 0.75, so people with high X
-    get more positive decisions than they should. And if -2 < X < -1, then
-    X <- X + 0.5, so people with X in (-2, -1) get fewer positive decisions
-    than they should.
-    '''
-
-    # If X > 1, then X <- X * 0.75. People with high X get more positive
-    # decisions than they should.
-    df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] *
-                                     0.75, df[featureX_col]))
-
-    # If -2 < X < -1, then X <- X + 0.5. People with X in (-2, -1) get fewer
-    # positive decisions than they should.
-    df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1) == 1,
-                           df.biased_X + 0.5, df.biased_X)
-
-    # Assert that every judge will have the same number of subjects.
-    assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
-
-    # Use the Lakkaraju-style batch decider, but judge by the biased X.
-    df_labeled, df = humanDeciderLakkaraju(df,
-                                           featureX_col='biased_X',
-                                           featureZ_col=featureZ_col,
-                                           nJudges_M=nJudges_M,
-                                           beta_X=beta_X,
-                                           beta_Z=beta_Z,
-                                           add_epsilon=add_epsilon)
-
-    return df_labeled, df
-
-
-# ## Evaluator modules
-
-# ### Convenience functions
-
-def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
-    '''
-    Fit a predictive model (default logistic regression) with given training
-    instances and return probabilities for test instances to obtain a given
-    class label.
-
-    Arguments:
-    ----------
-
-    x_train -- x values of training instances
-    y_train -- y values of training instances
-    x_test -- x values of test instances
-    class_value -- class label for which the probabilities are counted for.
-    model_type -- type of model to be fitted.
-
-    Returns:
-    --------
-    (1) Trained predictive model
-    (2) Probabilities for given test inputs for given class.
-    '''
-
-    if model_type is None or model_type in ["logistic_regression", "lr"]:
-        # Instantiate the model (using the default parameters)
-        logreg = LogisticRegression(solver='lbfgs')
-
-        # Check shape and fit the model.
- if x_train.ndim == 1: - logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train) - else: - logreg = logreg.fit(x_train, y_train) - - label_probs_logreg = getProbabilityForClass(x_test, logreg, - class_value) - - return logreg, label_probs_logreg - - elif model_type in ["random_forest", "rf"]: - # Instantiate the model - forest = RandomForestClassifier(n_estimators=100, max_depth=3) - - # Check shape and fit the model. - if x_train.ndim == 1: - forest = forest.fit(x_train.values.reshape(-1, 1), y_train) - else: - forest = forest.fit(x_train, y_train) - - label_probs_forest = getProbabilityForClass(x_test, forest, - class_value) - - return forest, label_probs_forest - - elif model_type == "fully_random": - - label_probs = np.ones_like(x_test) / 2 - - model_object = lambda x: 0.5 - - return model_object, label_probs - else: - raise ValueError("Invalid model_type!", model_type) - - -def getProbabilityForClass(x, model, class_value): - ''' - Function (wrapper) for obtaining the probability of a class given x and a - predictive model. - - Arguments: - ----------- - x -- individual features, an array of shape (observations, features) - model -- a trained sklearn model. Predicts probabilities for given x. - Should accept input of shape (observations, features) - class_value -- the resulting class to predict (usually 0 or 1). - - Returns: - -------- - (1) The probabilities of given class label for each x. - ''' - if x.ndim == 1: - # if x is vector, transform to column matrix. - f_values = model.predict_proba(np.array(x).reshape(-1, 1)) - else: - f_values = model.predict_proba(x) - - # Get correct column of predicted class, remove extra dimensions and return. - return f_values[:, model.classes_ == class_value].flatten() - - -def cdf(x_0, model, class_value): - ''' - Cumulative distribution function as described above. Integral is - approximated using Simpson's rule for efficiency. - - Arguments: - ---------- - - x_0 -- private features of an instance for which the value of cdf is to be - calculated. - model -- a trained sklearn model. Predicts probabilities for given x. - Should accept input of shape (observations, features) - class_value -- the resulting class to predict (usually 0 or 1). - - ''' - - def prediction(x): - return getProbabilityForClass( - np.array([x]).reshape(-1, 1), model, class_value) - - prediction_x_0 = prediction(x_0) - - x_values = np.linspace(-15, 15, 40000) - - x_preds = prediction(x_values) - - y_values = scs.norm.pdf(x_values) - - results = np.zeros(x_0.shape[0]) - - for i in range(x_0.shape[0]): - - y_copy = y_values.copy() - - y_copy[x_preds > prediction_x_0[i]] = 0 - - results[i] = si.simps(y_copy, x=x_values) - - return results - -# ### Contraction algorithm -# -# Below is an implementation of Lakkaraju's team's algorithm presented in -# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant -# parameters to be passed to the function are presented in the description. - -def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col, - accRateR_col, r): - ''' - This is an implementation of the algorithm presented by Lakkaraju - et al. in their paper "The Selective Labels Problem: Evaluating - Algorithmic Predictions in the Presence of Unobservables" (2017). - - Arguments: - ---------- - df -- The (Pandas) data frame containing the data, judge decisions, - judge IDs, results and probability scores. - judgeIDJ_col -- String, the name of the column containing the judges' IDs - in df. 
- decisionT_col -- String, the name of the column containing the judges' decisions - resultY_col -- String, the name of the column containing the realization - modelProbS_col -- String, the name of the column containing the probability - scores from the black-box model B. - accRateR_col -- String, the name of the column containing the judges' - acceptance rates - r -- Float between 0 and 1, the given acceptance rate. - - Returns: - -------- - (1) The estimated failure rate at acceptance rate r. - ''' - # Get ID of the most lenient judge. - most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()] - - # Subset. "D_q is the set of all observations judged by q." - D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy() - - # All observations of R_q have observed outcome labels. - # "R_q is the set of observations in D_q with observed outcome labels." - R_q = D_q[D_q[decisionT_col] == 1].copy() - - # Sort observations in R_q in descending order of confidence scores S and - # assign to R_sort_q. - # "Observations deemed as high risk by B are at the top of this list" - R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False) - - number_to_remove = int( - round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0]))) - - # "R_B is the list of observations assigned to t = 1 by B" - R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]] - - return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0] - - -# ### Evaluators - -def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r): - - df.sort_values(by='B_prob_0_model', inplace=True, ascending=True) - - to_release = int(round(df.shape[0] * r)) - - failed = df[resultY_col][0:to_release] == 0 - - return np.sum(failed) / df.shape[0], df.shape[0] - - -def labeledOutcomesEvaluator(df, - featureX_col, - decisionT_col, - resultY_col, - r, - adjusted=False): - - df_observed = df.loc[df[decisionT_col] == 1, :] - - df_observed = df_observed.sort_values(by='B_prob_0_model', - inplace=False, - ascending=True) - - to_release = int(round(df_observed.shape[0] * r)) - - failed = df_observed[resultY_col][0:to_release] == 0 - - if adjusted: - return np.mean(failed), df.shape[0] - - return np.sum(failed) / df.shape[0], df.shape[0] - - -def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col, - accRateR_col, r): - - # Get judges with correct leniency as list - is_correct_leniency = df[accRateR_col].round(1) == r - - # No judges with correct leniency - if np.sum(is_correct_leniency) == 0: - return np.nan, np.nan - - correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col] - - # Released are the people they judged and released, T = 1 - released = df[df[judgeIDJ_col].isin(correct_leniency_list) - & (df[decisionT_col] == 1)] - - failed = released[resultY_col] == 0 - - # Get their failure rate, aka ratio of reoffenders to number of people judged in total - return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0] - - -def monteCarloEvaluator(df, - featureX_col, - decisionT_col, - resultY_col, - accRateR_col, - r, - mu_X=0, - mu_Z=0, - beta_X=1, - beta_Z=1, - sigma_X=1, - sigma_Z=1): - - # Compute the predicted/assumed decision bounds for all the judges. - q_r = inverseCumulative(x=df[accRateR_col], - mu=mu_X + mu_Z, - sigma=np.sqrt((beta_X * sigma_X)**2 + - (beta_Z * sigma_Z)**2)) - - df = df.assign(bounds=logit(q_r) - df[featureX_col]) - - # Compute the expectation of Z when it is known to come from truncated - # Gaussian. 
-    alphabeta = (df.bounds - mu_Z) / (sigma_Z)
-
-    Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z)  # 1 - cdf(ab)
-
-    # E(Z | Z > a). Expectation of Z if negative decision.
-    exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_
-
-    # E(Z | Z < b). Expectation of Z if positive decision.
-    exp_upper_trunc = mu_Z - (
-        sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta)
-
-    exp_Z = (1 - df[decisionT_col]
-             ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc
-
-    # Attach the predicted probability for Y=0 to data.
-    df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z))
-
-    # Predictions drawn from binomial.
-    predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0])
-
-    df[resultY_col] = np.where(df[decisionT_col] == 0, predictions,
-                               df[resultY_col])
-
-    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    to_release = int(round(df.shape[0] * r))
-
-    failed = df[resultY_col][0:to_release] == 0
-
-    return np.sum(failed) / df.shape[0], df.shape[0]
-
-
-
-def perfComp(dgModule, deciderModule, title, save_name):
-    failure_rates = np.zeros((8, 7))
-    error_Ns = np.zeros((8, 7))
-
-    # Create data
-    df = dgModule()
-
-    # Decisions
-    df_labeled, df_unlabeled = deciderModule(df)
-
-    # Split data
-    train, test_labeled = train_test_split(df_labeled, test_size=0.5)
-
-    # Assign the same observations to the unlabeled data
-    test_unlabeled = df_unlabeled.iloc[test_labeled.index.values]
-
-    # Train model
-    B_model, predictions = fitPredictiveModel(
-        train.loc[train['decision_T'] == 1, 'X'],
-        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], 0, model_type=model_type)
-
-    # Attach predictions to data
-    test_labeled = test_labeled.assign(B_prob_0_model=predictions)
-    test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
-
-    test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
-
-    kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
-
-    # Find observed values
-    observed = test_labeled['decision_T'] == 1
-
-    # Assign data to the model
-    dat = dict(D=1,
-               N_obs=np.sum(observed),
-               N_cens=np.sum(~observed),
-               K=group_amount,
-               sigma_tau=sigma_tau,
-               M=len(set(df_unlabeled['judgeID_J'])),
-               jj_obs=test_labeled.loc[observed, 'judgeID_J']+1,
-               jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1,
-               kk_obs=kk_array[observed]+1,
-               kk_cens=kk_array[~observed]+1,
-               dec_obs=test_labeled.loc[observed, 'decision_T'],
-               dec_cens=test_labeled.loc[~observed, 'decision_T'],
-               X_obs=test_labeled.loc[observed, 'B_prob_0_model'].values.reshape(-1,1),
-               X_cens=test_labeled.loc[~observed, 'B_prob_0_model'].values.reshape(-1,1),
-               y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
-
-    fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9))
-
-    pars = fit.extract()
-
-    plt.figure(figsize=(15,30))
-
-    fit.plot();
-
-    plt.savefig(save_name + '_stan_diagnostic_plot')
-
-    plt.show()
-    plt.close('all')
-
-    print(fit, file=open(save_name + '_stan_fit_diagnostics.txt', 'w'))
-
-    # Bayes
-
-    # Initialize a matrix with one posterior draw per row and
-    # one observation per column.
-    y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
-
-    # Fill in the unobserved outcomes with the estimates.
-    y_imp[:, ~observed] = 1 - pars['y_est']
-
-    # Fill in the observed outcomes with the observed values.
-    y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
-
-    Rs = np.arange(.1, .9, .1)
-
-    to_release_list = np.round(test_labeled.shape[0] * Rs).astype(int)
-
-    for i in range(len(to_release_list)):
-
- failed = np.sum(y_imp[:, 0:to_release_list[i]], axis=1) - - est_failure_rates = failed / test_labeled.shape[0] - - failure_rates[i, 6] = np.mean(est_failure_rates) - - error_Ns[i, 6] = test_labeled.shape[0] - - for r in range(1, 9): - - print(".", end="") - - # True evaluation - - FR, N = trueEvaluationEvaluator(test_unlabeled, 'X', 'decision_T', - 'result_Y', r / 10) - - failure_rates[r - 1, 0] = FR - error_Ns[r - 1, 0] = N - - # Labeled outcomes only - - FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T', - 'result_Y', r / 10) - - failure_rates[r - 1, 1] = FR - error_Ns[r - 1, 1] = N - - # Adjusted labeled outcomes - - FR, N = labeledOutcomesEvaluator(test_labeled, 'X', 'decision_T', - 'result_Y', r / 10, adjusted=True) - - failure_rates[r - 1, 2] = FR - error_Ns[r - 1, 2] = N - - # Human evaluation - - FR, N = humanEvaluationEvaluator(test_labeled, 'judgeID_J', - 'decision_T', 'result_Y', - 'acceptanceRate_R', r / 10) - - failure_rates[r - 1, 3] = FR - error_Ns[r - 1, 3] = N - - # Contraction - - FR, N = contraction(test_labeled, 'judgeID_J', 'decision_T', - 'result_Y', 'B_prob_0_model', 'acceptanceRate_R', r / 10) - - failure_rates[r - 1, 4] = FR - error_Ns[r - 1, 4] = N - - # Causal model - analytic solution - - FR, N = monteCarloEvaluator(test_labeled, 'X', 'decision_T', - 'result_Y', 'acceptanceRate_R', r / 10) - - failure_rates[r - 1, 5] = FR - error_Ns[r - 1, 5] = N - - failure_SEs = standardError(failure_rates, error_Ns) - - x_ax = np.arange(0.1, 0.9, 0.1) - - labels = [ - 'True Evaluation', 'Labeled outcomes', 'Labeled outcomes, adj.', - 'Human evaluation', 'Contraction', 'Analytic solution', 'Potential outcomes' - ] - colours = ['g', 'magenta', 'darkviolet', 'r', 'b', 'k', 'c'] - - for i in range(failure_rates.shape[1]): - plt.errorbar(x_ax, - failure_rates[:, i], - label=labels[i], - c=colours[i], - yerr=failure_SEs[:, i]) - - plt.title('Failure rate vs. 
Acceptance rate') - plt.xlabel('Acceptance rate') - plt.ylabel('Failure rate') - plt.legend() - plt.grid() - - plt.savefig(save_name + '_all') - - plt.show() - - print("\nFailure rates:") - print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x})) - - print("\nMean absolute errors:") - for i in range(1, failure_rates.shape[1]): - print( - labels[i].ljust(len(max(labels, key=len))), - np.round( - np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5)) - - -sm = pystan.StanModel(file=stan_code_file_name) - -if which == 1: - print("\nWithout unobservables (Bernoulli + independent decisions)") - - dg = lambda: bernoulliDGWithoutUnobservables(N_total=N_sim) - - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col=None, nJudges_M=M_sim, beta_X=1, beta_Z=1) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations\n" + - "Bernoulli + independent decisions, without unobservables", - figure_path + "sl_bernoulli_independent_without_Z" - ) - -gc.collect() -plt.close('all') - -print("\nWith unobservables in the data") - -if which == 2: - print("\nBernoulli + independent decisions") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Bernoulli + independent decisions, with unobservables", - figure_path + "sl_bernoulli_independent_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 3: - print("\nThreshold rule + independent decisions") - - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) - - decider = lambda x: quantileDecider( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Threshold rule + independent decisions, with unobservables", - figure_path + "sl_threshold_independent_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 4: - print("\nBernoulli + non-independent (batch) decisions") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Bernoulli + non-independent decisions, with unobservables", - figure_path + "sl_bernoulli_batch_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 5: - print("\nThreshold rule + non-independent (batch) decisions") - - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim) - - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=1, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Fluctuation of failure rate estimates across iterations \n" + - "Threshold rule + non-independent decisions, with unobservables", - figure_path + "sl_threshold_batch_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 6: - print("\nRandom decider") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: randomDecider( - x, nJudges_M=M_sim, use_acceptance_rates=True) - - perfComp( - dg, lambda x: decider(x), - "Bernoulli + random decider with leniency and unobservables", - 
figure_path + "sl_random_decider_with_Z", - ) - -gc.collect() -plt.close('all') - -if which == 7: - print("\nBiased decider") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: biasDecider(x, 'X', 'Z', add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Bernoulli + biased decider with leniency and unobservables", - figure_path + "sl_biased_decider_with_Z", - ) - - -if which == 8: - print("\nBad judge") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: quantileDecider(x, 'X', 'Z', beta_X=0.2, add_epsilon=True, nJudges_M=M_sim) - - perfComp( - dg, lambda x: decider(x), - "Bernoulli + 'bad' decider with leniency and unobservables", - figure_path + "sl_bad_decider_with_Z" - ) - -gc.collect() -plt.close('all') - -if which == 9: - print("\nBernoulli + Bernoulli") - - dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim) - - decider = lambda x: bernoulliDecider(x, 'X', 'Z', nJudges_M=M_sim) - - perfComp( - dg, lambda x: decider(x), - "Bernoulli + Bernoulli", - figure_path + "sl_bernoulli_bernoulli_with_Z", - ) - -if which == 10: - print("\nBeta_Z = 3, Threshold + batch") - - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=3.0) - - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=3, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Beta_Z = 3, threshold + batch", - figure_path + "sl_threshold_batch_beta_Z_3_with_Z", - ) - -if which == 11: - print("\nBeta_Z = 5, Threshold + batch") - - dg = lambda: thresholdDGWithUnobservables(N_total=N_sim, beta_Z=5.0) - - decider = lambda x: humanDeciderLakkaraju( - x, featureX_col="X", featureZ_col="Z", nJudges_M=M_sim, beta_X=1, beta_Z=5, add_epsilon=True) - - perfComp( - dg, lambda x: decider(x), - "Beta_Z = 5, threshold + batch", - figure_path + "sl_threshold_batch_beta_Z_5_with_Z", - ) \ No newline at end of file