stan_modelling_theoretic.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
# Author: Riku Laine
# Date: 12AUG2019
# Project name: Potential outcomes in model evaluation
# Description: This script creates the figures and results used 
#              in synthetic data experiments.
#
# Parameters:
# -----------
# (1) figure_path : file location for saving the created figures.
# (2) N_sim : Size of simulated data set.
# (3) M_sim : Number of judges in simulated data, 
#             N_sim must be divisible by M_sim!
# (4) which : Which data + outcome analysis should be performed. Ranges from
#             1 to 14. See at the end of the script which figure is which.
#             Separate execution preferred for keeping runtime and memory decent.
# (5) group_amount : How many groups if non-linear model is used.
# (6) stan_code_file_name : Name of file containing the stan model code.
# (7) sigma_tau : Values of prior variance.
#
'''
# Refer to the `notes.tex` file for explanations about the modular framework.

# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as scs
import scipy.special as ssp
import numpy.random as npr
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pystan
import gc

plt.switch_backend('agg')

import sys

# figure storage name
figure_path = sys.argv[1]

# Size of simulated data set
N_sim = int(sys.argv[2])

# Number of judges in simulated data, N_sim must be divisible by M_sim!
M_sim = int(sys.argv[3])

# Which data + outcome generation should be performed.
which = int(sys.argv[4])

# How many groups if jung model is used
group_amount = int(sys.argv[5])

# Name of stan model code file
stan_code_file_name = sys.argv[6]

# Variance prior
sigma_tau = float(sys.argv[7])

# Settings

plt.rcParams.update({'font.size': 16})
plt.rcParams.update({'figure.figsize': (10, 6)})

print("These results have been obtained with the following settings:")

print("Number of observations in the simulated data:", N_sim)

print("Number of judges in the simulated data:", M_sim)

print("Number of groups:", group_amount)

print("Prior for the variances:", sigma_tau)

# Basic functions

def inverseLogit(x):
    return 1.0 / (1.0 + np.exp(-1.0 * x))


def logit(x):
    return np.log(x) - np.log(1.0 - x)


def inverseCumulative(x, mu, sigma):
    '''Compute the inverse of the cumulative distribution of logit-normal
    distribution at x with parameters mu and sigma (mean and st.dev.).'''

    return inverseLogit(ssp.erfinv(2 * x - 1) * np.sqrt(2 * sigma**2) - mu)

def standardError(p, n):
    denominator = p * (1 - p)
    
    return np.sqrt(denominator / n)


# ## Data generation modules

def bernoulliDGWithoutUnobservables(N_total=50000):
    '''Generates data | Variables: X, Y | Outcome from Bernoulli'''

    df = pd.DataFrame()

    # Sample feature X from standard Gaussian distribution, N(0, 1).
    df = df.assign(X=npr.normal(size=N_total))

    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
    df = df.assign(probabilities_Y=inverseLogit(df.X))

    # Draw Y ~ Bernoulli(1 - inverseLogit(X))
    # Note: P(Y=1|X=x) = 1 - P(Y=0|X=x) = 1 - inverseLogit(X)
    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)

    df = df.assign(result_Y=results)

    return df


def thresholdDGWithUnobservables(N_total=50000,
                                 beta_X=1.0,
                                 beta_Z=1.0,
                                 beta_W=0.2):
    '''Generates data | Variables: X, Z, W, Y | Outcome by threshold'''

    df = pd.DataFrame()

    # Sample the variables from standard Gaussian distributions.
    df = df.assign(X=npr.normal(size=N_total))
    df = df.assign(Z=npr.normal(size=N_total))
    df = df.assign(W=npr.normal(size=N_total))

    # Calculate P(Y=0|X, Z, W)
    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)

    df = df.assign(probabilities_Y=probabilities_Y)

    # Result is 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise
    df = df.assign(result_Y=np.where(df.probabilities_Y >= 0.5, 0, 1))

    return df


def bernoulliDGWithUnobservables(N_total=50000,
                                beta_X=1.0,
                                beta_Z=1.0,
                                beta_W=0.2):
    '''Generates data | Variables: X, Z, W, Y | Outcome from Bernoulli'''
    
    df = pd.DataFrame()

    # Sample feature X, Z and W from standard Gaussian distribution, N(0, 1).
    df = df.assign(X=npr.normal(size=N_total))
    df = df.assign(Z=npr.normal(size=N_total))
    df = df.assign(W=npr.normal(size=N_total))

    # Calculate P(Y=0|X=x) = 1 / (1 + exp(-X)) = inverseLogit(X)
    probabilities_Y = inverseLogit(beta_X * df.X + beta_Z * df.Z + beta_W * df.W)

    df = df.assign(probabilities_Y=probabilities_Y)

    # Draw Y from Bernoulli distribution
    results = npr.binomial(n=1, p=1 - df.probabilities_Y, size=N_total)

    df = df.assign(result_Y=results)

    return df


# ## Decider modules

def humanDeciderLakkaraju(df,
                          featureX_col,
                          featureZ_col=None,
                          nJudges_M=100,
                          beta_X=1,
                          beta_Z=1,
                          add_epsilon=True):
    '''Decider module | Non-independent batch decisions.'''
    # Decider as specified by Lakkaraju et al.

    # Assert that every judge will have the same number of subjects.
    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"

    # Compute the number of subjects allocated for each judge.
    nSubjects_N = int(df.shape[0] / nJudges_M)

    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))

    # Sample acceptance rates uniformly from a closed interval
    # from 0.1 to 0.9 and round to tenth decimal place.
    acceptance_rates = npr.uniform(.1, .9, nJudges_M)
    acceptance_rates = np.round(acceptance_rates, 10)


    # Replicate the rates so they can be attached to the corresponding judge ID.
    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))

    # Add noise
    if add_epsilon:
        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
    else:
        epsilon = 0
    
    # Compute probability for jail decision
    if featureZ_col is None:
        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
    else:
        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                    beta_Z * df[featureZ_col] + epsilon)


    df = df.assign(probabilities_T=probabilities_T)

    # Sort by judges then probabilities in decreasing order
    # Most dangerous for each judge are at the top.
    df.sort_values(by=["judgeID_J", "probabilities_T"],
                   ascending=False,
                   inplace=True)

    # Iterate over the data. Subject will be given a negative decision
    # if they are in the top (1-r)*100% of the individuals the judge will judge.
    # I.e. if their within-judge-index is under 1 - acceptance threshold times
    # the number of subjects assigned to each judge they will receive a
    # negative decision.
    df.reset_index(drop=True, inplace=True)

    df['decision_T'] = np.where((df.index.values % nSubjects_N) <
                                ((1 - df['acceptanceRate_R']) * nSubjects_N),
                                0, 1)

    df_labeled = df.copy()

    # Hide unobserved
    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan

    return df_labeled, df


def bernoulliDecider(df,
                    featureX_col,
                    featureZ_col=None,
                    nJudges_M=100,
                    beta_X=1,
                    beta_Z=1,
                    add_epsilon=True):
    '''Use X and Z to make a decision with probability 
    P(T=0|X, Z)=inverseLogit(beta_X*X+beta_Z*Z).'''
    # No leniencies involved
    
    # Assert that every judge will have the same number of subjects.
    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"

    # Compute the number of subjects allocated for each judge.
    nSubjects_N = int(df.shape[0] / nJudges_M)

    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))

    if add_epsilon:
        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
    else:
        epsilon = 0
    
    if featureZ_col is None:
        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)
    else:
        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                    beta_Z * df[featureZ_col] + epsilon)

    df = df.assign(probabilities_T=probabilities_T)

    # Draw T from Bernoulli distribution
    decisions = npr.binomial(n=1, p=1 - df.probabilities_T, size=df.shape[0])

    df = df.assign(decision_T=decisions)

    # Calculate the acceptance rates.
    acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values

    # Replicate the rates so they can be attached to the corresponding judge ID.
    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))

    df_labeled = df.copy()

    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan

    return df_labeled, df


def quantileDecider(df,
                    featureX_col,
                    featureZ_col=None,
                    nJudges_M=100,
                    beta_X=1,
                    beta_Z=1,
                    add_epsilon=True,
                    leniency_upper_limit=0.9):
    '''Assign decisions by the value of inverse cumulative distribution function
    of the logit-normal distribution at leniency r.'''
    
    # Assert that every judge will have the same number of subjects.
    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"

    # Compute the number of subjects allocated for each judge.
    nSubjects_N = int(df.shape[0] / nJudges_M)

    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))

    # Sample acceptance rates uniformly from a closed interval
    # from 0.1 to 0.9 and round to tenth decimal place.
    acceptance_rates = npr.uniform(.1, leniency_upper_limit, nJudges_M)
    acceptance_rates = np.round(acceptance_rates, 10)

    # Replicate the rates so they can be attached to the corresponding judge ID.
    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))

    if add_epsilon:
        epsilon = np.sqrt(0.1) * npr.normal(size=df.shape[0])
    else:
        epsilon = 0
    
    if featureZ_col is None:
        probabilities_T = inverseLogit(beta_X * df[featureX_col] + epsilon)

        # Compute the bounds straight from the inverse cumulative.
        # Assuming X is N(0, 1) so Var(bX*X)=bX**2*Var(X)=bX**2.
        df = df.assign(bounds=inverseCumulative(
            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2)))
    else:
        probabilities_T = inverseLogit(beta_X * df[featureX_col] +
                                    beta_Z * df[featureZ_col] + epsilon)

        # Compute the bounds straight from the inverse cumulative.
        # Assuming X and Z are i.i.d standard Gaussians with variance 1.
        # Thus Var(bx*X+bZ*Z)= bX**2*Var(X)+bZ**2*Var(Z).
        df = df.assign(bounds=inverseCumulative(
            x=df.acceptanceRate_R, mu=0, sigma=np.sqrt(beta_X**2 + beta_Z**2)))

    df = df.assign(probabilities_T=probabilities_T)

    # Assign negative decision if the predicted probability (probabilities_T) is
    # over the judge's threshold (bounds).
    df = df.assign(decision_T=np.where(df.probabilities_T >= df.bounds, 0, 1))
    
    # Calculate the observed acceptance rates.
    acceptance_rates = df.groupby('judgeID_J').mean().decision_T.values

    # Replicate the rates so they can be attached to the corresponding judge ID.
    df.acceptanceRate_R = np.repeat(acceptance_rates, nSubjects_N)

    df_labeled = df.copy()

    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan

    return df_labeled, df


def randomDecider(df, nJudges_M=100, use_acceptance_rates=False):
    '''Doesn't use any information about X and Z to make decisions.
    
    If use_acceptance_rates is False (default) then all decisions are positive
    with probability 0.5. If True, probabilities will be sampled from 
    U(0.1, 0.9) and rounded to tenth decimal place.'''

    # Assert that every judge will have the same number of subjects.
    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"

    # Compute the number of subjects allocated for each judge.
    nSubjects_N = int(df.shape[0] / nJudges_M)

    # Assign judge IDs as running numbering from 0 to nJudges_M - 1
    df = df.assign(judgeID_J=np.repeat(range(0, nJudges_M), nSubjects_N))

    if use_acceptance_rates:
        # Sample acceptance rates uniformly from a closed interval
        # from 0.1 to 0.9 and round to tenth decimal place.
        # 26JUL2019: Fix one leniency to 0.9 so that contraction can compute all
        #            values.
        acceptance_rates = np.append(npr.uniform(.1, .9, nJudges_M - 1), 0.9)
        acceptance_rates = np.round(acceptance_rates, 10)
    else:
        # No real leniency here -> set to 0.5.
        acceptance_rates = np.ones(nJudges_M) * 0.5

    # Replicate the rates so they can be attached to the corresponding judge ID.
    df = df.assign(acceptanceRate_R=np.repeat(acceptance_rates, nSubjects_N))

    df = df.assign(
        decision_T=npr.binomial(n=1, p=df.acceptanceRate_R, size=df.shape[0]))

    df_labeled = df.copy()

    df_labeled.loc[df.decision_T == 0, 'result_Y'] = np.nan

    return df_labeled, df


def biasDecider(df,
                featureX_col,
                featureZ_col=None,
                nJudges_M=100,
                beta_X=1,
                beta_Z=1,
                add_epsilon=True):
    '''
    Biased decider: If X > 1, then X <- X * 0.75. People with high X, 
    get more positive decisions as they should. And if -2 < X -1, then 
    X <- X + 0.5. People with X in [-2, 1], get less positive decisions 
    as they should.
    
    '''

    # If X > 1, then X <- X * 0.75. People with high X, get more positive
    # decisions as they should
    df = df.assign(biased_X=np.where(df[featureX_col] > 1, df[featureX_col] *
                                     0.75, df[featureX_col]))

    # If -2 < X -1, then X <- X + 0.5. People with X in [-2, 1], get less
    # positive decisions as they should
    df.biased_X = np.where((df.biased_X > -2) & (df.biased_X < -1) == 1,
                           df.biased_X + 0.5, df.biased_X)

    # Assert that every judge will have the same number of subjects.
    assert df.shape[0] % nJudges_M == 0, "Can't assign subjets evenly!"

    # Use quantile decider, but judge by the biased X.
    df_labeled, df = quantileDecider(df,
                                     featureX_col='biased_X',
                                     featureZ_col=featureZ_col,
                                     nJudges_M=nJudges_M,
                                     beta_X=beta_X,
                                     beta_Z=beta_Z,
                                     add_epsilon=add_epsilon)

    return df_labeled, df


# ## Evaluator modules

# ### Convenience functions

def fitPredictiveModel(x_train, y_train, x_test, class_value, model_type=None):
    '''
    Fit a predictive model (default logistic regression) with given training 
    instances and return probabilities for test instances to obtain a given 
    class label.
    
    Arguments:
    ----------
    
    x_train -- x values of training instances
    y_train -- y values of training instances
    x_test -- x values of test instances
    class_value -- class label for which the probabilities are counted for.
    model_type -- type of model to be fitted.
    
    Returns:
    --------
    (1) Trained predictive model
    (2) Probabilities for given test inputs for given class.
    '''

    if model_type is None or model_type in ["logistic_regression", "lr"]:
        # Instantiate the model (using the default parameters)
        logreg = LogisticRegression(solver='lbfgs')

        # Check shape and fit the model.
        if x_train.ndim == 1:
            logreg = logreg.fit(x_train.values.reshape(-1, 1), y_train)
        else:
            logreg = logreg.fit(x_train, y_train)

        label_probs_logreg = getProbabilityForClass(x_test, logreg,
                                                    class_value)

        return logreg, label_probs_logreg

    elif model_type in ["random_forest", "rf"]:
        # Instantiate the model
        forest = RandomForestClassifier(n_estimators=100, max_depth=3)

        # Check shape and fit the model.
        if x_train.ndim == 1:
            forest = forest.fit(x_train.values.reshape(-1, 1), y_train)
        else:
            forest = forest.fit(x_train, y_train)

        label_probs_forest = getProbabilityForClass(x_test, forest,
                                                    class_value)

        return forest, label_probs_forest

    elif model_type == "fully_random":

        label_probs = np.ones_like(x_test) / 2

        model_object = lambda x: 0.5

        return model_object, label_probs
    else:
        raise ValueError("Invalid model_type!", model_type)


def getProbabilityForClass(x, model, class_value):
    '''
    Function (wrapper) for obtaining the probability of a class given x and a 
    predictive model.

    Arguments:
    -----------
    x -- individual features, an array of shape (observations, features)
    model -- a trained sklearn model. Predicts probabilities for given x. 
        Should accept input of shape (observations, features)
    class_value -- the resulting class to predict (usually 0 or 1).

    Returns:
    --------
    (1) The probabilities of given class label for each x.
    '''
    if x.ndim == 1:
        # if x is vector, transform to column matrix.
        f_values = model.predict_proba(np.array(x).reshape(-1, 1))
    else:
        f_values = model.predict_proba(x)

    # Get correct column of predicted class, remove extra dimensions and return.
    return f_values[:, model.classes_ == class_value].flatten()

# ### Contraction algorithm
# 
# Below is an implementation of Lakkaraju's team's algorithm presented in 
# [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant
# parameters to be passed to the function are presented in the description.

def contraction(df, judgeIDJ_col, decisionT_col, resultY_col, modelProbS_col,
                accRateR_col, r):
    '''
    This is an implementation of the algorithm presented by Lakkaraju
    et al. in their paper "The Selective Labels Problem: Evaluating 
    Algorithmic Predictions in the Presence of Unobservables" (2017).

    Arguments:
    ----------
    df -- The (Pandas) data frame containing the data, judge decisions,
        judge IDs, results and probability scores.
    judgeIDJ_col -- String, the name of the column containing the judges' IDs
        in df.
    decisionT_col -- String, the name of the column containing the judges' decisions
    resultY_col -- String, the name of the column containing the realization
    modelProbS_col -- String, the name of the column containing the probability
        scores from the black-box model B.
    accRateR_col -- String, the name of the column containing the judges' 
        acceptance rates
    r -- Float between 0 and 1, the given acceptance rate.

    Returns:
    --------
    (1) The estimated failure rate at acceptance rate r.
    '''
    # Get ID of the most lenient judge.
    most_lenient_ID_q = df[judgeIDJ_col].loc[df[accRateR_col].idxmax()]

    # Subset. "D_q is the set of all observations judged by q."
    D_q = df[df[judgeIDJ_col] == most_lenient_ID_q].copy()
    
    ##### Agreement rate computation begins #######
    
    max_leniency = max(df[accRateR_col])
    
    if max_leniency < r:
        return np.nan, np.nan
    
    # Sort D_q
    # "Observations deemed as high risk by B are at the top of this list"
    D_sort_q = D_q.sort_values(by=modelProbS_col, ascending=False)
    
    # The agreement rate is now computed as the percentage of all the positive
    # decisions given by q in the 1-r % of the most dangerous subjects.
    all_negatives = np.sum(D_sort_q[decisionT_col] == 0)
    
    n_most_dangerous = int(round(D_q.shape[0] * (1-max_leniency)))
    
    agreed_decisions = np.sum(D_sort_q[decisionT_col][0:n_most_dangerous] == 0)

    print("The agreement rate of contraction is:", agreed_decisions/all_negatives)
    
    ##### Agreement rate computation ends #######

    # All observations of R_q have observed outcome labels.
    # "R_q is the set of observations in D_q with observed outcome labels."
    R_q = D_q[D_q[decisionT_col] == 1].copy()

    # Sort observations in R_q in descending order of confidence scores S and
    # assign to R_sort_q.
    # "Observations deemed as high risk by B are at the top of this list"
    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)

    number_to_remove = int(
        round((1.0 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))

    # "R_B is the list of observations assigned to t = 1 by B"
    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]

    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0], D_q.shape[0]


# ### Evaluators

def trueEvaluationEvaluator(df, featureX_col, decisionT_col, resultY_col, r):

    df.sort_values(by='B_prob_0_model', inplace=True, ascending=True)