Code styling

e357e897 · Riku-Laine · 62279842 · e357e897 · e357e897
Commit e357e897 authored 5 years ago by Riku-Laine
--- a/analysis_and_scripts/Analysis_07MAY2019_new.ipynb
+++ b/analysis_and_scripts/Analysis_07MAY2019_new.ipynb
--- a/analysis_and_scripts/derivation_validation.ipynb
+++ b/analysis_and_scripts/derivation_validation.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -62,7 +62,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 2,
   "metadata": {
    "scrolled": false
   },
@@ -71,16 +71,16 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Bias: (0.07408376336924696, 8.2249499843419745e-16)\n",
+      "Bias: (0.07558160888120039, 8.391244241812189e-16)\n",
-      "Bias: (0.03499978223514582, 3.8857564094325414e-16)\n",
+      "Bias: (0.03647337206409673, 4.0493577451280163e-16)\n",
      "Analytical: 0.03709585053394618\n",
-      "Estimated:  0.03908404387660974\n",
+      "Estimated:  0.03910840191792689\n",
-      "Difference: -0.0019881933426635634\n",
+      "Difference: -0.0020125513839807097\n",
      "\n",
      "Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\n",
      "\n",
      "Analytical: 0.14973849934787756 0.11264264881393138\n",
-      "Estimated:  0.15038208197340258 0.11129803809679284\n"
+      "Estimated:  0.15185984231839383 0.11275144040046695\n"
     ]
    }
   ],
@@ -176,6 +176,170 @@
    "      analytic_R_on_Y(0, a, b, c, d))\n",
    "print(\"Estimated: \", r1[0], r0[0])"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.0\n",
+      "0.0006666666666666666\n",
+      "0.00125\n",
+      "0.0015833333333333333\n",
+      "0.0025833333333333333\n",
+      "0.0035\n",
+      "0.004583333333333333\n",
+      "0.006333333333333333\n",
+      "0.007416666666666667\n",
+      "0.007833333333333333\n",
+      "0.008916666666666666\n",
+      "0.00925\n",
+      "0.009916666666666667\n",
+      "0.010833333333333334\n",
+      "0.012333333333333333\n",
+      "0.0135\n",
+      "0.01425\n",
+      "0.015166666666666667\n",
+      "0.016\n",
+      "0.017\n",
+      "0.017666666666666667\n",
+      "0.01875\n",
+      "0.02\n",
+      "0.021083333333333332\n",
+      "0.022416666666666668\n",
+      "0.0235\n",
+      "0.024083333333333335\n",
+      "0.025166666666666667\n",
+      "0.026166666666666668\n",
+      "0.02775\n",
+      "0.029\n",
+      "0.029916666666666668\n",
+      "0.03158333333333333\n",
+      "0.03283333333333333\n",
+      "0.03383333333333333\n",
+      "0.034833333333333334\n",
+      "0.036\n",
+      "0.037\n",
+      "0.03808333333333333\n",
+      "0.03941666666666667\n",
+      "0.04058333333333333\n",
+      "0.042083333333333334\n",
+      "0.043666666666666666\n",
+      "0.044583333333333336\n",
+      "0.046\n",
+      "0.04716666666666667\n",
+      "0.04841666666666666\n",
+      "0.050166666666666665\n",
+      "0.0515\n",
+      "0.053\n",
+      "0.05491666666666667\n",
+      "0.05575\n",
+      "0.05708333333333333\n",
+      "0.058166666666666665\n",
+      "0.059416666666666666\n",
+      "0.06091666666666667\n",
+      "0.06233333333333333\n",
+      "0.06383333333333334\n",
+      "0.06491666666666666\n",
+      "0.06575\n",
+      "0.06725\n",
+      "0.06891666666666667\n",
+      "0.07\n",
+      "0.07141666666666667\n",
+      "0.07283333333333333\n",
+      "0.07425\n",
+      "0.07691666666666666\n",
+      "0.07791666666666666\n",
+      "0.07933333333333334\n",
+      "0.0805\n",
+      "0.08225\n",
+      "0.08425\n",
+      "0.08541666666666667\n",
+      "0.08666666666666667\n",
+      "0.08841666666666667\n",
+      "0.08975\n",
+      "0.09141666666666666\n",
+      "0.09266666666666666\n",
+      "0.09391666666666666\n",
+      "0.095\n",
+      "0.09633333333333334\n",
+      "0.09791666666666667\n",
+      "0.09916666666666667\n",
+      "0.10066666666666667\n",
+      "0.10158333333333333\n",
+      "0.10358333333333333\n",
+      "0.10466666666666667\n",
+      "0.10608333333333334\n",
+      "0.1075\n",
+      "0.10908333333333334\n",
+      "0.11033333333333334\n",
+      "0.11216666666666666\n",
+      "0.1135\n",
+      "0.11525\n",
+      "0.1175\n",
+      "0.12008333333333333\n",
+      "0.12133333333333333\n",
+      "0.12308333333333334\n",
+      "0.12475\n",
+      "0.12741666666666668\n",
+      "0.12941666666666668\n"
+     ]
+    }
+   ],
+   "source": [
+    "r_, x_, t_, y_ = generateData()\n",
+    "\n",
+    "\n",
+    "def bailIndicator(r, y_model, x_train, x_test):\n",
+    "    '''\n",
+    "    Indicator function for whether a judge will bail or jail a suspect.\n",
+    "    \n",
+    "    Algorithm:\n",
+    "    ----------\n",
+    "    \n",
+    "    (1) Calculate recidivism probabilities from training set with a trained \n",
+    "        model and assign them to predictions_train.\n",
+    "    \n",
+    "    (2) Calculate recidivism probabilities from test set with the trained \n",
+    "        model and assign them to predictions_test.\n",
+    "    \n",
+    "    (3) Construct a cumulative distribution function of the probabilities in\n",
+    "        in predictions_train.\n",
+    "    \n",
+    "    (4)\n",
+    "    For pred in predictions_test:\n",
+    "        \n",
+    "        if pred belongs to a percentile (computed from step (3)) lower than r\n",
+    "            return True\n",
+    "        else\n",
+    "            return False\n",
+    "    \n",
+    "    Returns:\n",
+    "    --------\n",
+    "    (1) Boolean list indicating a bail decision (bail = True).\n",
+    "    '''\n",
+    "\n",
+    "    predictions_train = y_model.predict_proba(x_train)[:, 0]\n",
+    "\n",
+    "    predictions_test = y_model.predict_proba(x_test)[:, 0]\n",
+    "\n",
+    "    return [\n",
+    "        scs.percentileofscore(predictions_train, pred, kind='weak') < r\n",
+    "        for pred in predictions_test\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "recidivated = y_ == 0\n",
+    "\n",
+    "for r__ in range(101):\n",
+    "    released_for_bail = bailIndicator(r__, lr_y, np.array([t, x]).T, np.array([t_, x_]).T)\n",
+    "\n",
+    "    print(np.sum(recidivated & released_for_bail) / y_.shape[0])"
+   ]
  }
 ],
 "metadata": {

 %% Cell type:code id: tags:
 ``` python
 # Imports
 import numpy as np
 import pandas as pd
 from datetime import datetime
 import matplotlib.pyplot as plt
 import scipy.stats as scs
 import scipy.integrate as si
 import seaborn as sns
 import numpy.random as npr
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import PolynomialFeatures
 # Settings
 %matplotlib inline
 plt.rcParams.update({'font.size': 16})
 plt.rcParams.update({'figure.figsize': (14, 7)})
 # Suppress deprecation warnings.
 import warnings
 def fxn():
    warnings.warn("deprecated", DeprecationWarning)
 with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()
 def sigmoid(x):
    '''Return value of sigmoid function (inverse of logit) at x.'''
    return 1 / (1 + np.exp(-x))
 ```
 %% Cell type:markdown id: tags:
 In the code below
 $$P(Y=0|do(R=r))=\int_x P(Y=0|T=1, X=x)P(T=1|R=r, X=x)f_x(x)~dx$$
 where $f_x(x)$ is the pdf of x. (Now $X \sim U(0, 1)$, so $f_x(x)=1$.)
 %% Cell type:code id: tags:
 ``` python
 ###
 # Synthetic data
 # R ~ U(0,1)
 # X ~ N(0,1)
 # T ~ Bin(1, sigmoid(a * r + b * x))
 # Y ~ Bin(1, sigmoid(c * t + d * x))
 # Weights:
 # R -> T: a
 # X -> T: b
 # T -> Y: c
 # X -> Y: d
 N = 12000
 a = 1
 b = 1
 c = 1
 d = 1
 def generateData(N=N, a=a, b=b, c=c, d=d):
    r = npr.uniform(size=N)
    x = npr.uniform(size=N)
    t = npr.binomial(n=1, p=sigmoid(a * r + b * x), size=N)
    y = npr.binomial(n=1, p=sigmoid(c * t + d * x), size=N)
    y[t == 0] = 1
    return r, x, t, y
 def analytic_R_on_Y(r, a, b, c, d):
    '''Return probability of Y=0 with given parameters.'''
    # P(y=0|t=1, x) * p(t=1|r, x) * p(x)
    t_1 = si.quad(
        lambda x: (1 - sigmoid(c * 1 + d * x)) * sigmoid(a * r + b * x) * 1, 0,
        1)
    # P(y=0|t=0, x) * p(t=0|r, x) * p(x)
    # Now equal to 0, hence commented out. See data generation.
    t_0 = si.quad(
        lambda x: (1 - sigmoid(c * 0 + d * x)) * (1 - sigmoid(a * r + b * x)) *
        1, 0, 1)
    return t_1[0]  #+ t_0[0]
 r, x, t, y = generateData()
 # Fit predictive models.
 lr_t = LogisticRegression(solver='lbfgs')
 lr_y = LogisticRegression(solver='lbfgs')
 lr_t = lr_t.fit(np.array([r, x]).T, t)
 lr_y = lr_y.fit(np.array([t[t == 1], x[t == 1]]).T, y[t == 1])
 def causal_effect_of_R_on_Y(r):
    # Equal to integration of P(Y=0|T=0, X=x) * P(T=0|R=r, X=x) * f(x)
    # from 0 to 1
    print("Bias:",
        si.quad(
            lambda x: lr_y.predict_proba(np.array([[0, x]]))[0, 0] * lr_t.
            predict_proba(np.array([[r, x]]))[0, 0], 0, 1))
    # Integrate P(Y=0|T=1, X=x) * P(T=1|R=r, X=x) * f(x) from 0 to 1
    return (si.quad(
        lambda x: lr_y.predict_proba(np.array([[1, x]]))[0, 0] * lr_t.
        predict_proba(np.array([[r, x]]))[0, 1], 0, 1))
 r0 = causal_effect_of_R_on_Y(0)
 r1 = causal_effect_of_R_on_Y(1)
 analytical = analytic_R_on_Y(1, a, b, c, d) - analytic_R_on_Y(0, a, b, c, d)
 print("Analytical:", analytical)
 print("Estimated: ", r1[0] - r0[0])
 print("Difference:", analytical - r1[0] + r0[0])
 print()
 print("Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\n")
 print("Analytical:", analytic_R_on_Y(1, a, b, c, d),
      analytic_R_on_Y(0, a, b, c, d))
 print("Estimated: ", r1[0], r0[0])
 ```
 %% Output
-    Bias: (0.07408376336924696, 8.2249499843419745e-16)
+    Bias: (0.07558160888120039, 8.391244241812189e-16)
-    Bias: (0.03499978223514582, 3.8857564094325414e-16)
+    Bias: (0.03647337206409673, 4.0493577451280163e-16)
    Analytical: 0.03709585053394618
-    Estimated:  0.03908404387660974
+    Estimated:  0.03910840191792689
-    Difference: -0.0019881933426635634
+    Difference: -0.0020125513839807097
    Values for P(y=0|do(r=1)) and P(y=0|do(r=0))
    Analytical: 0.14973849934787756 0.11264264881393138
-    Estimated:  0.15038208197340258 0.11129803809679284
+    Estimated:  0.15185984231839383 0.11275144040046695
+%% Cell type:code id: tags:
+``` python
+r_, x_, t_, y_ = generateData()
+def bailIndicator(r, y_model, x_train, x_test):
+    '''
+    Indicator function for whether a judge will bail or jail a suspect.
+    Algorithm:
+    ----------
+    (1) Calculate recidivism probabilities from training set with a trained
+        model and assign them to predictions_train.
+    (2) Calculate recidivism probabilities from test set with the trained
+        model and assign them to predictions_test.
+    (3) Construct a cumulative distribution function of the probabilities in
+        in predictions_train.
+    (4)
+    For pred in predictions_test:
+        if pred belongs to a percentile (computed from step (3)) lower than r
+            return True
+        else
+            return False
+    Returns:
+    --------
+    (1) Boolean list indicating a bail decision (bail = True).
+    '''
+    predictions_train = y_model.predict_proba(x_train)[:, 0]
+    predictions_test = y_model.predict_proba(x_test)[:, 0]
+    return [
+        scs.percentileofscore(predictions_train, pred, kind='weak') < r
+        for pred in predictions_test
+    ]
+recidivated = y_ == 0
+for r__ in range(101):
+    released_for_bail = bailIndicator(r__, lr_y, np.array([t, x]).T, np.array([t_, x_]).T)
+    print(np.sum(recidivated & released_for_bail) / y_.shape[0])
+```
+%% Output
+    0.0
+    0.0006666666666666666
+    0.00125
+    0.0015833333333333333
+    0.0025833333333333333
+    0.0035
+    0.004583333333333333
+    0.006333333333333333
+    0.007416666666666667
+    0.007833333333333333
+    0.008916666666666666
+    0.00925
+    0.009916666666666667
+    0.010833333333333334
+    0.012333333333333333
+    0.0135
+    0.01425
+    0.015166666666666667
+    0.016
+    0.017
+    0.017666666666666667
+    0.01875
+    0.02
+    0.021083333333333332
+    0.022416666666666668
+    0.0235
+    0.024083333333333335
+    0.025166666666666667
+    0.026166666666666668
+    0.02775
+    0.029
+    0.029916666666666668
+    0.03158333333333333
+    0.03283333333333333
+    0.03383333333333333
+    0.034833333333333334
+    0.036
+    0.037
+    0.03808333333333333
+    0.03941666666666667
+    0.04058333333333333
+    0.042083333333333334
+    0.043666666666666666
+    0.044583333333333336
+    0.046
+    0.04716666666666667
+    0.04841666666666666
+    0.050166666666666665
+    0.0515
+    0.053
+    0.05491666666666667
+    0.05575
+    0.05708333333333333
+    0.058166666666666665
+    0.059416666666666666
+    0.06091666666666667
+    0.06233333333333333
+    0.06383333333333334
+    0.06491666666666666
+    0.06575
+    0.06725
+    0.06891666666666667
+    0.07
+    0.07141666666666667
+    0.07283333333333333
+    0.07425
+    0.07691666666666666
+    0.07791666666666666
+    0.07933333333333334
+    0.0805
+    0.08225
+    0.08425
+    0.08541666666666667
+    0.08666666666666667
+    0.08841666666666667
+    0.08975
+    0.09141666666666666
+    0.09266666666666666
+    0.09391666666666666
+    0.095
+    0.09633333333333334
+    0.09791666666666667
+    0.09916666666666667
+    0.10066666666666667
+    0.10158333333333333
+    0.10358333333333333
+    0.10466666666666667
+    0.10608333333333334
+    0.1075
+    0.10908333333333334
+    0.11033333333333334
+    0.11216666666666666
+    0.1135
+    0.11525
+    0.1175
+    0.12008333333333333
+    0.12133333333333333
+    0.12308333333333334
+    0.12475
+    0.12741666666666668
+    0.12941666666666668