derivation_validation.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.stats as scs\n",
    "import scipy.integrate as si\n",
    "import seaborn as sns\n",
    "import numpy.random as npr\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "\n",
    "\n",
    "# Settings\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "plt.rcParams.update({'font.size': 16})\n",
    "plt.rcParams.update({'figure.figsize': (14, 7)})\n",
    "\n",
    "# Suppress deprecation warnings.\n",
    "\n",
    "import warnings\n",
    "\n",
    "\n",
    "def fxn():\n",
    "    warnings.warn(\"deprecated\", DeprecationWarning)\n",
    "\n",
    "\n",
    "with warnings.catch_warnings():\n",
    "    warnings.simplefilter(\"ignore\")\n",
    "    fxn()\n",
    "    \n",
    "def sigmoid(x):\n",
    "    '''Return value of sigmoid function (inverse of logit) at x.'''\n",
    "\n",
    "    return 1 / (1 + np.exp(-x))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the code below\n",
    "\n",
    "$$P(Y=0|do(R=r))=\\int_x P(Y=0|T=1, X=x)P(T=1|R=r, X=x)f_x(x)~dx$$\n",
    "\n",
    "where $f_x(x)$ is the pdf of x. (Now $X \\sim U(0, 1)$, so $f_x(x)=1$.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bias: (0.07558160888120039, 8.391244241812189e-16)\n",
      "Bias: (0.03647337206409673, 4.0493577451280163e-16)\n",
      "Analytical: 0.03709585053394618\n",
      "Estimated:  0.03910840191792689\n",
      "Difference: -0.0020125513839807097\n",
      "\n",
      "Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\n",
      "\n",
      "Analytical: 0.14973849934787756 0.11264264881393138\n",
      "Estimated:  0.15185984231839383 0.11275144040046695\n"
     ]
    }
   ],
   "source": [
    "###\n",
    "# Synthetic data\n",
    "\n",
    "# R ~ U(0,1)\n",
    "# X ~ N(0,1)\n",
    "# T ~ Bin(1, sigmoid(a * r + b * x))\n",
    "# Y ~ Bin(1, sigmoid(c * t + d * x))\n",
    "\n",
    "# Weights:\n",
    "# R -> T: a\n",
    "# X -> T: b\n",
    "# T -> Y: c\n",
    "# X -> Y: d\n",
    "\n",
    "N = 12000\n",
    "a = 1\n",
    "b = 1\n",
    "c = 1\n",
    "d = 1\n",
    "\n",
    "\n",
    "def generateData(N=N, a=a, b=b, c=c, d=d):\n",
    "\n",
    "    r = npr.uniform(size=N)\n",
    "    x = npr.uniform(size=N)\n",
    "\n",
    "    t = npr.binomial(n=1, p=sigmoid(a * r + b * x), size=N)\n",
    "\n",
    "    y = npr.binomial(n=1, p=sigmoid(c * t + d * x), size=N)\n",
    "\n",
    "    y[t == 0] = 1\n",
    "\n",
    "    return r, x, t, y\n",
    "\n",
    "\n",
    "def analytic_R_on_Y(r, a, b, c, d):\n",
    "    '''Return probability of Y=0 with given parameters.'''\n",
    "\n",
    "    # P(y=0|t=1, x) * p(t=1|r, x) * p(x)\n",
    "    t_1 = si.quad(\n",
    "        lambda x: (1 - sigmoid(c * 1 + d * x)) * sigmoid(a * r + b * x) * 1, 0,\n",
    "        1)\n",
    "\n",
    "    # P(y=0|t=0, x) * p(t=0|r, x) * p(x)\n",
    "    # Now equal to 0, hence commented out. See data generation.\n",
    "    t_0 = si.quad(\n",
    "        lambda x: (1 - sigmoid(c * 0 + d * x)) * (1 - sigmoid(a * r + b * x)) *\n",
    "        1, 0, 1)\n",
    "\n",
    "    return t_1[0]  #+ t_0[0]\n",
    "\n",
    "\n",
    "r, x, t, y = generateData()\n",
    "\n",
    "# Fit predictive models.\n",
    "lr_t = LogisticRegression(solver='lbfgs')\n",
    "lr_y = LogisticRegression(solver='lbfgs')\n",
    "\n",
    "lr_t = lr_t.fit(np.array([r, x]).T, t)\n",
    "lr_y = lr_y.fit(np.array([t[t == 1], x[t == 1]]).T, y[t == 1])\n",
    "\n",
    "\n",
    "def causal_effect_of_R_on_Y(r):\n",
    "\n",
    "    # Equal to integration of P(Y=0|T=0, X=x) * P(T=0|R=r, X=x) * f(x)\n",
    "    # from 0 to 1\n",
    "    print(\"Bias:\",\n",
    "        si.quad(\n",
    "            lambda x: lr_y.predict_proba(np.array([[0, x]]))[0, 0] * lr_t.\n",
    "            predict_proba(np.array([[r, x]]))[0, 0], 0, 1))\n",
    "\n",
    "    # Integrate P(Y=0|T=1, X=x) * P(T=1|R=r, X=x) * f(x) from 0 to 1\n",
    "    return (si.quad(\n",
    "        lambda x: lr_y.predict_proba(np.array([[1, x]]))[0, 0] * lr_t.\n",
    "        predict_proba(np.array([[r, x]]))[0, 1], 0, 1))\n",
    "\n",
    "\n",
    "r0 = causal_effect_of_R_on_Y(0)\n",
    "r1 = causal_effect_of_R_on_Y(1)\n",
    "\n",
    "analytical = analytic_R_on_Y(1, a, b, c, d) - analytic_R_on_Y(0, a, b, c, d)\n",
    "\n",
    "print(\"Analytical:\", analytical)\n",
    "print(\"Estimated: \", r1[0] - r0[0])\n",
    "print(\"Difference:\", analytical - r1[0] + r0[0])\n",
    "print()\n",
    "print(\"Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\\n\")\n",
    "print(\"Analytical:\", analytic_R_on_Y(1, a, b, c, d),\n",
    "      analytic_R_on_Y(0, a, b, c, d))\n",
    "print(\"Estimated: \", r1[0], r0[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0\n",
      "0.0006666666666666666\n",
      "0.00125\n",
      "0.0015833333333333333\n",
      "0.0025833333333333333\n",
      "0.0035\n",
      "0.004583333333333333\n",
      "0.006333333333333333\n",
      "0.007416666666666667\n",
      "0.007833333333333333\n",
      "0.008916666666666666\n",
      "0.00925\n",
      "0.009916666666666667\n",
      "0.010833333333333334\n",
      "0.012333333333333333\n",
      "0.0135\n",
      "0.01425\n",
      "0.015166666666666667\n",
      "0.016\n",
      "0.017\n",
      "0.017666666666666667\n",
      "0.01875\n",
      "0.02\n",
      "0.021083333333333332\n",
      "0.022416666666666668\n",
      "0.0235\n",
      "0.024083333333333335\n",
      "0.025166666666666667\n",
      "0.026166666666666668\n",
      "0.02775\n",
      "0.029\n",
      "0.029916666666666668\n",
      "0.03158333333333333\n",
      "0.03283333333333333\n",
      "0.03383333333333333\n",
      "0.034833333333333334\n",
      "0.036\n",
      "0.037\n",
      "0.03808333333333333\n",
      "0.03941666666666667\n",
      "0.04058333333333333\n",
      "0.042083333333333334\n",
      "0.043666666666666666\n",
      "0.044583333333333336\n",
      "0.046\n",
      "0.04716666666666667\n",
      "0.04841666666666666\n",
      "0.050166666666666665\n",
      "0.0515\n",
      "0.053\n",
      "0.05491666666666667\n",
      "0.05575\n",
      "0.05708333333333333\n",
      "0.058166666666666665\n",
      "0.059416666666666666\n",
      "0.06091666666666667\n",
      "0.06233333333333333\n",
      "0.06383333333333334\n",
      "0.06491666666666666\n",
      "0.06575\n",
      "0.06725\n",
      "0.06891666666666667\n",
      "0.07\n",
      "0.07141666666666667\n",
      "0.07283333333333333\n",
      "0.07425\n",
      "0.07691666666666666\n",
      "0.07791666666666666\n",
      "0.07933333333333334\n",
      "0.0805\n",
      "0.08225\n",
      "0.08425\n",
      "0.08541666666666667\n",
      "0.08666666666666667\n",
      "0.08841666666666667\n",
      "0.08975\n",
      "0.09141666666666666\n",
      "0.09266666666666666\n",
      "0.09391666666666666\n",
      "0.095\n",
      "0.09633333333333334\n",
      "0.09791666666666667\n",
      "0.09916666666666667\n",
      "0.10066666666666667\n",
      "0.10158333333333333\n",
      "0.10358333333333333\n",
      "0.10466666666666667\n",
      "0.10608333333333334\n",
      "0.1075\n",
      "0.10908333333333334\n",
      "0.11033333333333334\n",
      "0.11216666666666666\n",
      "0.1135\n",
      "0.11525\n",
      "0.1175\n",
      "0.12008333333333333\n",
      "0.12133333333333333\n",
      "0.12308333333333334\n",
      "0.12475\n",
      "0.12741666666666668\n",
      "0.12941666666666668\n"
     ]
    }
   ],
   "source": [
    "r_, x_, t_, y_ = generateData()\n",
    "\n",
    "\n",
    "def bailIndicator(r, y_model, x_train, x_test):\n",
    "    '''\n",
    "    Indicator function for whether a judge will bail or jail a suspect.\n",
    "    \n",
    "    Algorithm:\n",
    "    ----------\n",
    "    \n",
    "    (1) Calculate recidivism probabilities from training set with a trained \n",
    "        model and assign them to predictions_train.\n",
    "    \n",
    "    (2) Calculate recidivism probabilities from test set with the trained \n",
    "        model and assign them to predictions_test.\n",
    "    \n",
    "    (3) Construct a cumulative distribution function of the probabilities in\n",
    "        in predictions_train.\n",
    "    \n",
    "    (4)\n",
    "    For pred in predictions_test:\n",
    "        \n",
    "        if pred belongs to a percentile (computed from step (3)) lower than r\n",
    "            return True\n",
    "        else\n",
    "            return False\n",
    "    \n",
    "    Returns:\n",
    "    --------\n",
    "    (1) Boolean list indicating a bail decision (bail = True).\n",
    "    '''\n",
    "\n",
    "    predictions_train = y_model.predict_proba(x_train)[:, 0]\n",
    "\n",
    "    predictions_test = y_model.predict_proba(x_test)[:, 0]\n",
    "\n",
    "    return [\n",
    "        scs.percentileofscore(predictions_train, pred, kind='weak') < r\n",
    "        for pred in predictions_test\n",
    "    ]\n",
    "\n",
    "\n",
    "recidivated = y_ == 0\n",
    "\n",
    "for r__ in range(101):\n",
    "    released_for_bail = bailIndicator(r__, lr_y, np.array([t, x]).T, np.array([t_, x_]).T)\n",
    "\n",
    "    print(np.sum(recidivated & released_for_bail) / y_.shape[0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}