    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    '''
    # Author: Riku Laine
    
    # Date: 12AUG2019
    
    # Project name: Potential outcomes in model evaluation
    # Description: This script creates the figures and results used 
    #              in synthetic data experiments.
    #
    # Parameters:
    # -----------
    
    # (1) figure_path : file location for saving the created figures.
    
    # (2) N_sim : Size of simulated data set.
    # (3) M_sim : Number of judges in simulated data, 
    #             N_sim must be divisible by M_sim!
    
    # (4) which : Which data + outcome analysis should be performed. Ranges from
    #             1 to 14. See the end of the script for which figure corresponds
    #             to which setting. Separate executions are preferred to keep
    #             runtime and memory consumption reasonable.
    # (5) group_amount : How many groups if non-linear model is used.
    
    # (6) stan_code_file_name : Name of file containing the stan model code.
    
    # (7) sigma_tau : Value of the prior variance.
    
    #
    '''
    # Refer to the `notes.tex` file for explanations about the modular framework.
    
    # Imports
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import scipy.stats as scs
    import scipy.special as ssp
    import numpy.random as npr
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    import pystan
    import gc
    
    plt.switch_backend('agg')
    
    import sys
    
    # figure storage name
    figure_path = sys.argv[1]
    
    # Size of simulated data set
    N_sim = int(sys.argv[2])
    
    # Number of judges in simulated data, N_sim must be divisible by M_sim!
    M_sim = int(sys.argv[3])
    
    # Which data + outcome generation should be performed.
    which = int(sys.argv[4])
    
    # How many groups if the non-linear model is used
    group_amount = int(sys.argv[5])
    
    # Name of stan model code file
    stan_code_file_name = sys.argv[6]
    
    # Variance prior
    sigma_tau = float(sys.argv[7])
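
    # A hypothetical example invocation could look as follows (the values are
    # illustrative placeholders only, not the settings behind any reported
    # results):
    #
    #   python stan_modelling_theoretic.py ./figures/ 4000 40 1 10 model.stan 1.0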
    
    # Settings
    
    plt.rcParams.update({'font.size': 16})
    plt.rcParams.update({'figure.figsize': (10, 6)})
    
    print("These results have been obtained with the following settings:")
    
    print("Number of observations in the simulated data:", N_sim)
    
    print("Number of judges in the simulated data:", M_sim)
    
    print("Number of groups:", group_amount)
    
    print("Prior for the variances:", sigma_tau)
    
    # Basic functions
    
    
    def inverseLogit(x):
        '''Return the logistic sigmoid, 1 / (1 + exp(-x)).'''
        return 1.0 / (1.0 + np.exp(-1.0 * x))
    
    
    def logit(x):
        return np.log(x) - np.log(1.0 - x)
    
    
    
    def inverseCumulative(x, mu, sigma):
        '''Compute the inverse of the cumulative distribution function (quantile
        function) of the logit-normal distribution at x with parameters mu and
        sigma (mean and st.dev. of the underlying normal distribution).'''

        return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) + mu)
    
    def standardError(p, n):
        '''Standard error of a proportion p estimated from n observations.'''
        denominator = p * (1 - p)
        
        return np.sqrt(denominator / n)
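
    # Optional sanity check for inverseCumulative (a sketch only, not executed as
    # part of the experiments; paste into an interpreter if needed). The quantile
    # of a logit-normal(0, sigma) variable should match the empirical quantile of
    # inverseLogit(N(0, sigma)) samples:
    #
    #   samples = inverseLogit(npr.normal(scale=2, size=100000))
    #   np.quantile(samples, 0.3)              # empirical 30th percentile
    #   inverseCumulative(0.3, mu=0, sigma=2)  # should be close to the above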
    
    
    
    # ## Data generation modules
    
    def bernoulliDGWithoutUnobservables(N_total=50000):
        '''Generates data | Variables: X, Y | Outcome from Bernoulli'''
    
        df = pd.DataFrame()
    
        # Sample feature X from standard Gaussian distribution, N(0, 1).
        df = df.assign(X=npr.normal(size=N_total))
    
    
        # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
        df = df.assign(probabilities_Y=inverseLogit(df.X))
    
    
    
        # Draw Y ~ Bernoulli(1 - inverseLogit(X))
        # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X)
    
        results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
    
        df = df.assign(result_Y=results)
    
        return df
    
    
    def thresholdDGWithUnobservables(N_total=50000,
                                     beta_X=1.0,
                                     beta_Z=1.0,
                                     beta_W=0.2):
        '''Generates data | Variables: X, Z, W, Y | Outcome by threshold'''
    
        df = pd.DataFrame()
    
        # Sample the variables from standard Gaussian distributions.
        df = df.assign(X=npr.normal(size=N_total))
        df = df.assign(Z=npr.normal(size=N_total))
        df = df.assign(W=npr.normal(size=N_total))
    
        # Calculate P(Y=0|X, Z, W)
    
        probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
    
    
        df = df.assign(probabilities_Y=probabilities_Y)
    
        # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise
        df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1))
    
        return df
    
    
    def bernoulliDGWithUnobservables(N_total=50000,
                                    beta_X=1.0,
                                    beta_Z=1.0,
                                    beta_W=0.2):
        '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli'''
        
        df = pd.DataFrame()
    
        # Sample features X, Z and W from the standard Gaussian distribution, N(0, 1).
        df = df.assign(X=npr.normal(size=N_total))
        df = df.assign(Z=npr.normal(size=N_total))
        df = df.assign(W=npr.normal(size=N_total))
    
    
        # Calculate P(Y=0|X, Z, W) = inverseLogit(beta_X * X + beta_Z * Z + beta_W * W)
        probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)
    
    
        df = df.assign(probabilities_Y=probabilities_Y)
    
        # Draw Y from Bernoulli distribution
        results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)
    
        df = df.assign(result_Y=results)
    
        return df
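
    # For illustration, a data generating module could be called as below (a
    # sketch with arbitrary example values; the experiment-specific calls are
    # made at the end of this script via the `which` switch):
    #
    #   df_example = bernoulliDGWithUnobservables(N_total=1000, beta_W=0.5)
    #   list(df_example.columns)  # ['X', 'Z', 'W', 'probabilities_Y', 'result_Y']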
    
    
    # ## Decider modules
    
    def humanDeciderLakkaraju(df,
                              featureX_col,
                              featureZ_col=None,
                              nJudges_M=100,
                              beta_X=1,
                              beta_Z=1,
                              add_epsilon=True):
        '''Decider module | Non-independent batch decisions.'''
    
        # Decider as specified by Lakkaraju et al.
    
    
        # Assert that every judge will have the same number of subjects.
        assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
    
        # Compute the number of subjects allocated for each judge.
        nSubjects_N = int(df.shape[0] / nJudges_M)
    
        # Assign judge IDs as running numbering from 0 to nJudges_M - 1
        df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
    
        # Sample acceptance rates uniformly from a closed interval
        # from 0.1 to 0.9 and round to tenth decimal place.
    
        acceptance_rates = npr.uniform(.1, .9, nJudges_M)
    
        acceptance_rates = np.round(acceptance_rates, 10)
    
    
        # Replicate the rates so they can be attached to the corresponding judge ID.
        df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
    
    
        if add_epsilon:
            epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
        else:
            epsilon = 0
        
    
        # Compute probability for jail decision
    
        if featureZ_col is None:
    
            probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
    
        else:
    
            probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                           beta_Z * df[featureZ_col] + epsilon)
    
    
        df = df.assign(probabilities_T=probabilities_T)
    
        # Sort by judges then probabilities in decreasing order
        # Most dangerous for each judge are at the top.
        df.sort_values(by=["judgeID_J", "probabilities_T"],
                       ascending=False,
                       inplace=True)
    
        # Iterate over the data. A subject will be given a negative decision
        # if they are among the top (1 - r) * 100% most dangerous individuals
        # assigned to their judge, i.e. if their within-judge index is below
        # (1 - acceptance rate) times the number of subjects assigned to each
        # judge.
        df.reset_index(drop=True, inplace=True)
    
        df['decision_T'] = np.where((df.index.values % nSubjects_N) <
                                    ((1 - df['acceptanceRate_R']) * nSubjects_N),
                                    0, 1)
    
        df_labeled = df.copy()
    
        # Hide unobserved
        df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
    
        return df_labeled, df
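
    # Sketch of how a decider module is chained after a data generating module
    # (arbitrary example values; the actual wiring of the experiments is done in
    # perfComp and in the `which` blocks at the end of the script):
    #
    #   df = bernoulliDGWithUnobservables(N_total=5000)
    #   df_labeled, df_full = humanDeciderLakkaraju(df, featureX_col='X',
    #                                               featureZ_col='Z', nJudges_M=50)
    #   # df_labeled hides result_Y (NaN) where decision_T == 0; df_full keeps all.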
    
    
    def bernoulliDecider(df,
                        featureX_col,
                        featureZ_col=None,
                        nJudges_M=100,
                        beta_X=1,
                        beta_Z=1,
                        add_epsilon=True):
        '''Use X and Z to make a decision with probability
        P(T=0|X, Z) = inverseLogit(beta_X * X + beta_Z * Z).'''
        # No leniencies involved.
        
    
        # Assert that every judge will have the same number of subjects.
        assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
    
        # Compute the number of subjects allocated for each judge.
        nSubjects_N = int(df.shape[0] / nJudges_M)
    
        # Assign judge IDs as running numbering from 0 to nJudges_M - 1
        df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
    
        if add_epsilon:
            epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
        else:
            epsilon = 0
        
        if featureZ_col is None:
    
            probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
    
        else:
    
            probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                           beta_Z * df[featureZ_col] + epsilon)
    
        df = df.assign(probabilities_T=probabilities_T)
    
        # Draw T from Bernoulli distribution
        decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0])
    
        df = df.assign(decision_T=decisions)
    
        # Calculate the acceptance rates.
        acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values
    
        # Replicate the rates so they can be attached to the corresponding judge ID.
        df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
    
        df_labeled = df.copy()
    
        df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
    
        return df_labeled, df
    
    
    def quantileDecider(df,
                        featureX_col,
                        featureZ_col=None,
                        nJudges_M=100,
                        beta_X=1,
                        beta_Z=1,
    
                        add_epsilon=True,
                        leniency_upper_limit=0.9):
    
        '''Assign decisions by the value of inverse cumulative distribution function
        of the logit-normal distribution at leniency r.'''
        
        # Assert that every judge will have the same number of subjects.
        assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
    
        # Compute the number of subjects allocated for each judge.
        nSubjects_N = int(df.shape[0] / nJudges_M)
    
        # Assign judge IDs as running numbering from 0 to nJudges_M - 1
        df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
    
        # Sample acceptance rates uniformly from a closed interval
        # from 0.1 to leniency_upper_limit and round to the tenth decimal place.
    
        acceptance_rates = npr.uniform(.1, leniency_upper_limit, nJudges_M)
    
        acceptance_rates = np.round(acceptance_rates, 10)
    
        # Replicate the rates so they can be attached to the corresponding judge ID.
        df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
    
        if add_epsilon:
            epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
        else:
            epsilon = 0
        
        if featureZ_col is None:
    
            probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
    
    
            # Compute the bounds straight from the inverse cumulative.
            # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2.
    
            df = df.assign(bounds=inverseCumulative(
                x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2)))
        else:
    
            probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                           beta_Z * df[featureZ_col] + epsilon)
    
            # Compute the bounds straight from the inverse cumulative.
            # Assuming X and Z are i.i.d standard Gaussians with variance 1.
            # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z).
    
            df = df.assign(bounds=inverseCumulative(
                x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2)))
    
        df = df.assign(probabilities_T=probabilities_T)
    
        # Assign negative decision if the predicted probability (probabilities_T) is
        # over the judge's threshold (bounds).
        df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1))
    
        
        # Calculate the observed acceptance rates.
        acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values
    
        # Replicate the rates so they can be attached to the corresponding judge ID.
        df.acceptanceRate_R = np.repeat(acceptance_rates, nSubjects_N)
    
    
        df_labeled = df.copy()
    
        df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
    
        return df_labeled, df
    
    
    def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
        '''Doesn't use any information about X and Z to make decisions.
        
    If use_acceptance_rates is False (default), all decisions are positive
    with probability 0.5. If True, acceptance rates will be sampled from
    U(0.1, 0.9) and rounded to the tenth decimal place.'''
    
        # Assert that every judge will have the same number of subjects.
        assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
    
        # Compute the number of subjects allocated for each judge.
        nSubjects_N = int(df.shape[0] / nJudges_M)
    
        # Assign judge IDs as running numbering from 0 to nJudges_M - 1
        df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))
    
        if use_acceptance_rates:
            # Sample acceptance rates uniformly from a closed interval
            # from 0.1 to 0.9 and round to tenth decimal place.
    
            # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
            #            values.
            acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
            acceptance_rates = np.round(acceptance_rates, 10)
    
        else:
            # No real leniency here -> set to 0.5.
            acceptance_rates = np.ones(nJudges_M) * 0.5
    
        # Replicate the rates so they can be attached to the corresponding judge ID.
        df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))
    
        df = df.assign(
            decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0]))
    
        df_labeled = df.copy()
    
        df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan
    
        return df_labeled, df
    
    
    def biasDecider(df,
                    featureX_col,
                    featureZ_col=None,
                    nJudges_M=100,
                    beta_X=1,
                    beta_Z=1,
                    add_epsilon=True):
        '''
        Biased decider: If X > 1, then X <- X * 0.75, so people with high X
        get more positive decisions than they should. And if -2 < X < -1, then
        X <- X + 0.5, so people with X in (-2, -1) get less positive decisions
        than they should.
        '''
    
        # If X > 1, then X <- X * 0.75. People with high X get more positive
        # decisions than they should.
        df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] *
                                         0.75, df[featureX_col]))
    
        # If -2 < X < -1, then X <- X + 0.5. People with X in (-2, -1) get less
        # positive decisions than they should.
        df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1),
                               df.biased_X + 0.5, df.biased_X)
    
        # Assert that every judge will have the same number of subjects.
        assert df.shape[0] % nJudges_M == 0, "Can't assign subjects evenly!"
    
        # Use quantile decider, but judge by the biased X.
    
        df_labeled, df = quantileDecider(df,
                                         featureX_col='biased_X',
                                         featureZ_col=featureZ_col,
                                         nJudges_M=nJudges_M,
                                         beta_X=beta_X,
                                         beta_Z=beta_Z,
                                         add_epsilon=add_epsilon)
    
        return df_labeled, df
    
    
    # ## Evaluator modules
    
    # ### Convenience functions
    
    def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
        '''
        Fit a predictive model (default logistic regression) with given training 
        instances and return probabilities for test instances to obtain a given 
        class label.
        
        Arguments:
        ----------
        
        x_train -- x values of training instances
        y_train -- y values of training instances
        x_test -- x values of test instances
        class_value -- class label for which the probabilities are computed.
        model_type -- type of model to be fitted.
        
        Returns:
        --------
        (1) Trained predictive model
        (2) Probabilities for given test inputs for given class.
        '''
    
        if model_type is None or model_type in ["logistic_regression", "lr"]:
            # Instantiate the model (using the default parameters)
            logreg = LogisticRegression(solver='lbfgs')
    
            # Check shape and fit the model.
            if x_train.ndim == 1:
                logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
            else:
                logreg = logreg.fit(x_train, y_train)
    
            label_probs_logreg = getProbabilityForClass(x_test, logreg,
                                                        class_value)
    
            return logreg, label_probs_logreg
    
        elif model_type in ["random_forest", "rf"]:
            # Instantiate the model
            forest = RandomForestClassifier(n_estimators=100, max_depth=3)
    
            # Check shape and fit the model.
            if x_train.ndim == 1:
                forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
            else:
                forest = forest.fit(x_train, y_train)
    
            label_probs_forest = getProbabilityForClass(x_test, forest,
                                                        class_value)
    
            return forest, label_probs_forest
    
        elif model_type == "fully_random":
    
            label_probs = np.ones_like(x_test) / 2
    
            model_object = lambda x: 0.5
    
            return model_object, label_probs
        else:
            raise ValueError("Invalid model_type!", model_type)
    
    
    def getProbabilityForClass(x, model, class_value):
        '''
        Function (wrapper) for obtaining the probability of a class given x and a 
        predictive model.
    
        Arguments:
        -----------
        x -- individual features, an array of shape (observations, features)
        model -- a trained sklearn model. Predicts probabilities for given x. 
            Should accept input of shape (observations, features)
        class_value -- the resulting class to predict (usually 0 or 1).
    
        Returns:
        --------
        (1) The probabilities of given class label for each x.
        '''
        if x.ndim == 1:
            # if x is vector, transform to column matrix.
            f_values = model.predict_proba(np.array(x).reshape(-1, 1))
        else:
            f_values = model.predict_proba(x)
    
        # Get correct column of predicted class, remove extra dimensions and return.
        return f_values[:, model.classes_ == class_value].flatten()
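
    # Sketch of how the convenience functions above are used (hypothetical
    # variables; the real calls are made inside perfComp, using only the
    # observations with decision_T == 1 for training):
    #
    #   model, probs = fitPredictiveModel(x_train=train['X'],
    #                                     y_train=train['result_Y'],
    #                                     x_test=test['X'],
    #                                     class_value=0,
    #                                     model_type='lr')
    #   # probs[i] is the fitted model's estimate of P(Y = 0 | X = x_i).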
    
    # ### Contraction algorithm
    # 
    # Below is an implementation of Lakkaraju's team's algorithm presented in 
    # [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant
    # parameters to be passed to the function are presented in the description.
    
    def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
                    accRateR_col, r):
        '''
        This is an implementation of the algorithm presented by Lakkaraju
        et al. in their paper "The Selective Labels Problem: Evaluating 
        Algorithmic Predictions in the Presence of Unobservables" (2017).
    
        Arguments:
        ----------
        df -- The (Pandas) data frame containing the data, judge decisions,
            judge IDs, results and probability scores.
        judgeIDJ_col -- String, the name of the column containing the judges' IDs
            in df.
        decisionT_col -- String, the name of the column containing the judges' decisions
        resultY_col -- String, the name of the column containing the realization
        modelProbS_col -- String, the name of the column containing the probability
            scores from the black-box model B.
        accRateR_col -- String, the name of the column containing the judges' 
            acceptance rates
        r -- Float between 0 and 1, the given acceptance rate.
    
        Returns:
        --------
        (1) The estimated failure rate at acceptance rate r.
        '''
        # Get ID of the most lenient judge.
        most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]
    
        # Subset. "D_q is the set of all observations judged by q."
        D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
    
        
        ##### Agreement rate computation begins #######
        
        max_leniency = max(df[accRateR_col])
        
        if max_leniency < r:
            return np.nan, np.nan
        
        # Sort D_q
        # "Observations deemed as high risk by B are at the top of this list"
        D_sort_q = D_q.sort_values(by=modelProbS_col, ascending=False)
        
        # The agreement rate is computed as the share of the negative decisions
        # given by q that fall within the (1 - max leniency) * 100% of subjects
        # deemed most dangerous by the model B.
        all_negatives = np.sum(D_sort_q[decisionT_col] == 0)
        
        n_most_dangerous = int(round(D_q.shape[0] * (1-max_leniency)))
        
        agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous] == 0)
    
        print("The agreement rate of contraction is:", agreed_decisions/all_negatives)
        
        ##### Agreement rate computation ends #######
    
    
        # All observations of R_q have observed outcome labels.
        # "R_q is the set of observations in D_q with observed outcome labels."
        R_q = D_q[D_q[decisionT_col] == 1].copy()
    
        # Sort observations in R_q in descending order of confidence scores S and
        # assign to R_sort_q.
        # "Observations deemed as high risk by B are at the top of this list"
        R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)
    
        number_to_remove = int(
            round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))
    
        # "R_B is the list of observations assigned to t = 1 by B"
        R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]
    
    
        return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]
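
    # Sketch of a single contraction call (column names follow the conventions
    # used elsewhere in this script; r is a hypothetical leniency level):
    #
    #   fr, n = contraction(test_labeled, judgeIDJ_col='judgeID_J',
    #                       decisionT_col='decision_T', resultY_col='result_Y',
    #                       modelProbS_col='B_prob_0_model',
    #                       accRateR_col='acceptanceRate_R', r=0.5)
    #   # fr is the estimated failure rate at acceptance rate 0.5.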
    
    
    
    # ### Evaluators
    
    def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r):
        '''True evaluation: rank all subjects by the model's predicted P(Y=0) and
        compute the failure rate among the r * 100% lowest-risk subjects using
        the ground-truth outcomes of the unlabeled data.'''

        df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
    
    
    
        to_release = int(round(df.shape[0] * r))
        
        failed = df[resultY_col][0:to_release] == 0
    
    
    
        return np.sum(failed) / df.shape[0], df.shape[0]
    
    
    
    def labeledOutcomesEvaluator(df,
                                 featureX_col,
                                 decisionT_col,
                                 resultY_col,
                                 r,
                                 adjusted=False):
        '''Labeled outcomes: as the true evaluation, but using only the subjects
        whose outcomes were observed (decision_T == 1).'''

        df_observed = df.loc[df[decisionT_col] == 1, :]
    
    
    
        df_observed = df_observed.sort_values(by='B_prob_0_model',
                                              inplace=False,
                                              ascending=True)
    
    
        to_release = int(round(df_observed.shape[0] * r))
        
        failed = df_observed[resultY_col][0:to_release] == 0
    
    
        if adjusted:
    
            return np.mean(failed), df.shape[0]
    
    
    
        return np.sum(failed) / df.shape[0], df.shape[0]
    
    
    
    def humanEvaluationEvaluator(df, judgeIDJ_col, decisionT_col, resultY_col,
                                 accRateR_col, r):
        '''Human evaluation: estimate the failure rate at leniency r from the
        decisions of the judges whose acceptance rate rounds to r.'''

        # Get judges with correct leniency as a list.
        is_correct_leniency = df[accRateR_col].round(1) == r
    
        # No judges with correct leniency
        if np.sum(is_correct_leniency) == 0:
    
            return np.nan, np.nan
    
    
        correct_leniency_list = df.loc[is_correct_leniency, judgeIDJ_col]
    
        # Released are the people they judged and released, T = 1
        released = df[df[judgeIDJ_col].isin(correct_leniency_list)
                      & (df[decisionT_col] == 1)]
    
    
        failed = released[resultY_col] == 0
        
    
        # Compute their failure rate, i.e. the ratio of failures (Y = 0) to the
        # total number of subjects judged.
    
        return np.sum(failed) / correct_leniency_list.shape[0], correct_leniency_list.shape[0]
    
    
    
    def monteCarloEvaluator(df,
                            featureX_col,
                            decisionT_col,
                            resultY_col,
                            accRateR_col,
                            r,
                            mu_X=0,
                            mu_Z=0,
                            beta_X=1,
                            beta_Z=1,
                            sigma_X=1,
                            sigma_Z=1):
        '''Analytical counterfactual evaluator: estimate the unobserved Z from its
        truncated Gaussian distribution given the judge's decision, impute outcomes
        where the decision was negative, and compute the failure rate at
        acceptance rate r.'''

        # Compute the predicted/assumed decision bounds for all the judges.
    
        q_r = inverseCumulative(x=df[accRateR_col],
                                mu=mu_X + mu_Z,
                                sigma=np.sqrt((beta_X * sigma_X)**2 +
                                              (beta_Z * sigma_Z)**2))
    
    
        df = df.assign(bounds=logit(q_r) - df[featureX_col])
    
    
        # Compute the expectation of Z when it is known to come from truncated
        # Gaussian.
    
        alphabeta = (df.bounds - mu_Z) / (sigma_Z)
    
    
        Z_ = scs.norm.sf(alphabeta, loc=mu_Z, scale=sigma_Z)  # 1 - cdf(ab)
    
        # E(Z | Z > a). Expectation of Z if negative decision.
        exp_lower_trunc = mu_Z + (sigma_Z * scs.norm.pdf(alphabeta)) / Z_
    
        # E(Z | Z < b). Expectation of Z if positive decision.
        exp_upper_trunc = mu_Z - (
            sigma_Z * scs.norm.pdf(alphabeta)) / scs.norm.cdf(alphabeta)
    
    
        exp_Z = (1 - df[decisionT_col]
                 ) * exp_lower_trunc + df[decisionT_col] * exp_upper_trunc
    
    
        # Attach the predicted probability for Y=0 to data.
    
        df = df.assign(predicted_Y=inverseLogit(df[featureX_col] + exp_Z))
    
    
    
        # Predictions drawn from binomial.
        predictions = npr.binomial(n=1, p=1 - df.predicted_Y, size=df.shape[0])
    
    
    
        df[resultY_col] = np.where(df[decisionT_col] == 0, predictions,
                                     df[resultY_col])
    
    
    
        df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
    
    
    
        to_release = int(round(df.shape[0] * r))
        
        failed = df[resultY_col][0:to_release] == 0
    
    
    
        return np.sum(failed) / df.shape[0], df.shape[0]
    
    
    
    def drawDiagnostics(title, save_name, f_rates, titles):
    
        
        cols = 2
        rows = int(np.ceil(len(f_rates) / cols))
        
        plt.figure(figsize=(16, 4.5*rows+1))
    
        ax = plt.subplot(rows, cols, 1)
        x_ax = np.arange(1, 9, 1) / 10
    
        plt.boxplot(f_rates[0], labels=x_ax)
    
        plt.title(titles[0])
        plt.xlabel('Acceptance rate')
        plt.ylabel('Failure rate')
        plt.grid()
    
        for i in range(len(f_rates)):
            plt.subplot(rows, cols, i + 1, sharey=ax)
    
            plt.boxplot(f_rates[i], labels=x_ax)
    
            plt.title(titles[i])
            plt.xlabel('Acceptance rate')
            plt.ylabel('Failure rate')
            plt.grid()
    
        plt.tight_layout()
    
        plt.subplots_adjust(top=0.85)
    
        plt.suptitle(title, y=0.96, weight='bold')
    
    
        plt.savefig(save_name + '_diagnostic_plot')
    
    
        plt.show()
    
    
    def perfComp(dgModule, deciderModule, title, save_name, model_type=None, 
                 fit_with_Z=False, nIter=10):
        '''Performance comparison, a.k.a the main figure. 
        
        Parameters:
        -----------
            dgModule: Module creating the data
        deciderModule: Module assigning the decisions
            title: Title for the diagnostic figure
            save_name: name for saving
            model_type: what kind of model should be fitted. Logistic regression
                        ("lr"), random forest ("rf") or fully random 
                        ("fully_random")
            fit_with_Z: should the predictive model be fit with Z (the latent 
                        variable)
            nIter: Number of train-test splits to be performed.
            
        '''
        
        failure_rates = np.zeros((8, 4))
        failure_stds = np.zeros((8, 4))
    
    
        f_rate_true = np.zeros((nIter, 8))
        f_rate_label = np.zeros((nIter, 8))
        f_rate_cont = np.zeros((nIter, 8))
    
        f_rate_cf = np.zeros((nIter, 8))
    
    
        # Create data
        df = dgModule()
    
    
        # Assign decisions
    
        df_labeled, df_unlabeled = deciderModule(df)
        
    
        for i in range(nIter):
    
    
    
            # Split data
            train, test_labeled = train_test_split(df_labeled, test_size=0.5)
            
            # Pick the same observations from the unlabeled data.
            test_unlabeled = df_unlabeled.iloc[test_labeled.index.values]
    
            # Train model
            if fit_with_Z:
                B_model, predictions = fitPredictiveModel(
                        train.loc[train['decision_T'] == 1, ['X', 'Z']],
                        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled[['X', 'Z']], 
                        0, model_type=model_type)
    
        
    
            else:
                B_model, predictions = fitPredictiveModel(
                        train.loc[train['decision_T'] == 1, 'X'],
                        train.loc[train['decision_T'] == 1, 'result_Y'], test_labeled['X'], 
                        0, model_type=model_type)
    
            # Attach predictions to data
            test_labeled = test_labeled.assign(B_prob_0_model=predictions)
            test_unlabeled = test_unlabeled.assign(B_prob_0_model=predictions)
    
        
    
            test_labeled.sort_values(by='B_prob_0_model', inplace=True, ascending=True)
    
        
    
            # Assign group IDs
            kk_array = pd.qcut(test_labeled['B_prob_0_model'], group_amount, labels=False)
    
        
    
            # Find observed values
            observed = test_labeled['decision_T'] == 1
    
        
    
            # Assign data to the model
    
            
    
            ###############################
            # Change this assignment if you want to use predictions as input to the
            # model.
            ###############################
            dat = dict(D=1,
                       N_obs=np.sum(observed),
                       N_cens=np.sum(~observed),
                       K=group_amount,
                       sigma_tau=sigma_tau,
                       M=len(set(df_unlabeled['judgeID_J'])),
                       jj_obs=test_labeled.loc[observed, 'judgeID_J']+1,
                       jj_cens=test_labeled.loc[~observed, 'judgeID_J']+1,
                       kk_obs=kk_array[observed]+1,
                       kk_cens=kk_array[~observed]+1,
                       dec_obs=test_labeled.loc[observed, 'decision_T'],
                       dec_cens=test_labeled.loc[~observed, 'decision_T'],
                       X_obs=test_labeled.loc[observed, 'X'].values.reshape(-1,1),
                       X_cens=test_labeled.loc[~observed, 'X'].values.reshape(-1,1),
                       y_obs=test_labeled.loc[observed, 'result_Y'].astype(int))
    
            # Do the sampling - Change this if you wish to use other methods.
            fit = sm.sampling(data=dat, chains=5, iter=5000, control = dict(adapt_delta=0.9))
                    
            pars = fit.extract()
    
            
            plt.figure(figsize=(15,30))
        
            fit.plot();
        
            plt.savefig(save_name + '_stan_convergence_diagnostic_plot_' + str(i))
            
            plt.show()
            plt.close('all')
            gc.collect()
        
            print(fit, file=open(save_name + '_stan_fit_diagnostics_' + str(i) + '.txt', 'w'))
    
    
    
            # Bayes
            
            # Format matrix, each row is a sample of the posterior
            # columns are observations
                    
            y_imp = np.ones((pars['y_est'].shape[0], test_labeled.shape[0]))
            
            # Impute the unobserved outcomes with the posterior estimates.
            # Reverse the binary coding so that failures (Y = 0) become ones and
            # can be counted by summing.
            y_imp[:, ~observed] = 1 - pars['y_est']
            
            # Remove the variables to conserve memory
            del(pars)
            del(fit)
            
            # Impute the observed
            y_imp[:, observed] = 1 - test_labeled.loc[observed, 'result_Y']
                                
            for r in range(1, 9):
                
                to_release = np.round(test_labeled.shape[0] * (r / 10)).astype(int)
                            
                failed = np.sum(y_imp[:, 0:to_release], axis=1)
                
                f_rate_cf[i, r - 1] = np.mean(failed / test_labeled.shape[0])
        
    
                print(".", end="")
    
                # True evaluation
    
        
                f_rate_true[i, r - 1], N = trueEvaluationEvaluator(test_unlabeled,
                           'X', 'decision_T', 'result_Y', r / 10)
                    
    
                # Labeled outcomes only
    
        
                f_rate_label[i, r - 1], N = labeledOutcomesEvaluator(test_labeled, 
                            'X', 'decision_T', 'result_Y', r / 10)
                
    
                # Contraction
    
        
                f_rate_cont[i, r - 1], N = contraction(test_labeled, 'judgeID_J', 
                           'decision_T', 'result_Y', 'B_prob_0_model', 
                           'acceptanceRate_R', r / 10)
                
    
        failure_rates[:, 0] = np.mean(f_rate_true, axis=0)
        failure_rates[:, 1] = np.mean(f_rate_label, axis=0)
    
        failure_rates[:, 2] = np.mean(f_rate_cont, axis=0)
        failure_rates[:, 3] = np.mean(f_rate_cf, axis=0)
    
        failure_stds[:, 0] = np.std(f_rate_true, axis=0)
        failure_stds[:, 1] = np.std(f_rate_label, axis=0)
        failure_stds[:, 2] = np.std(f_rate_cont, axis=0)
        failure_stds[:, 3] = np.std(f_rate_cf, axis=0)
    
    
        x_ax = np.arange(0.1, 0.9, 0.1)
    
    
        labels = ['True Evaluation', 'Labeled outcomes', 'Contraction', 
                  'Counterfactuals']
        
        colours = ['g', 'magenta', 'b', 'r']
        
        line_styles = ['--', ':', '-.', '-']
    
    
        for i in range(failure_rates.shape[1]):
            plt.errorbar(x_ax,
                         failure_rates[:, i],
                         label=labels[i],
                         c=colours[i],
    
                         linestyle=line_styles[i],
                         yerr=failure_stds[:, i])
    
    
    
        #plt.title('Failure rate vs. Acceptance rate')
    
        plt.xlabel('Acceptance rate')
        plt.ylabel('Failure rate')
        plt.legend()
        plt.grid()
        
    
        plt.savefig(save_name + '_all')
    
        
        plt.show()
    
        print("\nFailure rates:")
        print(np.array2string(failure_rates, formatter={'float_kind':lambda x: "%.5f" % x}))
        
    
        print("\nStandard deviations:")
        print(np.array2string(failure_stds, formatter={'float_kind':lambda x: "%.5f" % x}))
    
    
        print("\nMean absolute errors:")
        for i in range(1, failure_rates.shape[1]):
            print(
                labels[i].ljust(len(max(labels, key=len))),
                np.round(
                    np.mean(np.abs(failure_rates[:, 0] - failure_rates[:, i])), 5))
    
        drawDiagnostics(title=title,
    
                    save_name=save_name,
                    f_rates=[f_rate_true, f_rate_label, f_rate_cont, f_rate_cf],
                    titles=labels)
    
    
    
    # Compile stan model
    
    sm = pystan.StanModel(file=stan_code_file_name)
    
    if which == 1:
    
        print("\nWith unobservables (Bernoullian outcome + independent decisions)")
        
        print("Decision-maker in the data and model: random and random")
    
    
    
        dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
    
    
    
        decider = lambda x: randomDecider(x, nJudges_M=M_sim, use_acceptance_rates=True)
    
    
        perfComp(
            dg, lambda x: decider(x),
            "Fluctuation of failure rate estimates across iterations\n" +
            "Bernoulli + independent decisions, with unobservables",
            figure_path + "sl_bernoulli_independent_without_Z",
            model_type="fully_random",
            fit_with_Z=False
        )
    
    plt.close('all')
    
    
    if which == 2:
    
        print("\nWith unobservables (Bernoullian outcome + independent decisions)")
        
        print("Decision-maker in the data and model: random and y ~ x")
    
    
        dg = lambda: bernoulliDGWithUnobservables(N_total=N_sim)
    
    
        decider = lambda x: randomDecider(x, nJudges_M=M_sim, use_acceptance_rates=True)
    
    
        perfComp(