derivation_validation.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.stats as scs\n",
    "import scipy.integrate as si\n",
    "import seaborn as sns\n",
    "import numpy.random as npr\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "\n",
    "\n",
    "# Settings\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "plt.rcParams.update({'font.size': 16})\n",
    "plt.rcParams.update({'figure.figsize': (14, 7)})\n",
    "\n",
    "# Suppress deprecation warnings.\n",
    "\n",
    "import warnings\n",
    "\n",
    "\n",
    "def fxn():\n",
    "    warnings.warn(\"deprecated\", DeprecationWarning)\n",
    "\n",
    "\n",
    "with warnings.catch_warnings():\n",
    "    warnings.simplefilter(\"ignore\")\n",
    "    fxn()\n",
    "    \n",
    "def sigmoid(x):\n",
    "    '''Return value of sigmoid function (inverse of logit) at x.'''\n",
    "\n",
    "    return 1 / (1 + np.exp(-x))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the code below\n",
    "\n",
    "$$P(Y=0|do(R=r))=\\int_x P(Y=0|T=1, X=x)P(T=1|R=r, X=x)f_x(x)~dx$$\n",
    "\n",
    "where $f_x(x)$ is the pdf of x. (Now $X \\sim U(0, 1)$, so $f_x(x)=1$.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bias: (0.07131297477417739, 7.917330654880471e-16)\n",
      "Bias: (0.035957051207116175, 3.9920346147766154e-16)\n",
      "Analytical: 0.03709585053394618\n",
      "Estimated:  0.035378945320598335\n",
      "Difference: 0.001716905213347844\n",
      "\n",
      "Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\n",
      "\n",
      "Analytical: 0.14973849934787756 0.11264264881393138\n",
      "Estimated:  0.1432745402973986 0.10789559497680028\n"
     ]
    }
   ],
   "source": [
    "###\n",
    "# Synthetic data 1\n",
    "\n",
    "# R ~ U(0,1)\n",
    "# X ~ N(0,1)\n",
    "# T ~ Bin(1, sigmoid(a * r + b * x))\n",
    "# Y ~ Bin(1, sigmoid(c * t + d * x))\n",
    "\n",
    "# Weights:\n",
    "# R -> T: a\n",
    "# X -> T: b\n",
    "# T -> Y: c\n",
    "# X -> Y: d\n",
    "\n",
    "N = 12000\n",
    "a = 1\n",
    "b = 1\n",
    "c = 1\n",
    "d = 1\n",
    "\n",
    "\n",
    "def generateData(N=N, a=a, b=b, c=c, d=d):\n",
    "\n",
    "    r = npr.uniform(size=N)\n",
    "    x = npr.uniform(size=N)\n",
    "\n",
    "    t = npr.binomial(n=1, p=sigmoid(a * r + b * x), size=N)\n",
    "\n",
    "    y = npr.binomial(n=1, p=sigmoid(c * t + d * x), size=N)\n",
    "\n",
    "    y[t == 0] = 1\n",
    "\n",
    "    return r, x, t, y\n",
    "\n",
    "\n",
    "def analytic_R_on_Y(r, a, b, c, d):\n",
    "    '''Return probability of Y=0 with given parameters.'''\n",
    "\n",
    "    # P(y=0|t=1, x) * p(t=1|r, x) * p(x)\n",
    "    t_1 = si.quad(\n",
    "        lambda x: (1 - sigmoid(c * 1 + d * x)) * sigmoid(a * r + b * x) * 1, 0,\n",
    "        1)\n",
    "\n",
    "    # P(y=0|t=0, x) * p(t=0|r, x) * p(x)\n",
    "    # Now equal to 0, hence commented out. See data generation.\n",
    "    t_0 = si.quad(\n",
    "        lambda x: (1 - sigmoid(c * 0 + d * x)) * (1 - sigmoid(a * r + b * x)) *\n",
    "        1, 0, 1)\n",
    "\n",
    "    return t_1[0]  #+ t_0[0]\n",
    "\n",
    "\n",
    "r, x, t, y = generateData()\n",
    "\n",
    "# Fit predictive models.\n",
    "lr_t = LogisticRegression(solver='lbfgs')\n",
    "lr_y = LogisticRegression(solver='lbfgs')\n",
    "\n",
    "lr_t = lr_t.fit(np.array([r, x]).T, t)\n",
    "lr_y = lr_y.fit(np.array([t[t == 1], x[t == 1]]).T, y[t == 1])\n",
    "\n",
    "\n",
    "def causal_effect_of_R_on_Y(r):\n",
    "\n",
    "    # Equal to integration of P(Y=0|T=0, X=x) * P(T=0|R=r, X=x) * f(x)\n",
    "    # from 0 to 1\n",
    "    print(\"Bias:\",\n",
    "        si.quad(\n",
    "            lambda x: lr_y.predict_proba(np.array([[0, x]]))[0, 0] * lr_t.\n",
    "            predict_proba(np.array([[r, x]]))[0, 0], 0, 1))\n",
    "\n",
    "    # Integrate P(Y=0|T=1, X=x) * P(T=1|R=r, X=x) * f(x) from 0 to 1\n",
    "    return (si.quad(\n",
    "        lambda x: lr_y.predict_proba(np.array([[1, x]]))[0, 0] * lr_t.\n",
    "        predict_proba(np.array([[r, x]]))[0, 1], 0, 1))\n",
    "\n",
    "\n",
    "r0 = causal_effect_of_R_on_Y(0)\n",
    "r1 = causal_effect_of_R_on_Y(1)\n",
    "\n",
    "analytical = analytic_R_on_Y(1, a, b, c, d) - analytic_R_on_Y(0, a, b, c, d)\n",
    "\n",
    "print(\"Analytical:\", analytical)\n",
    "print(\"Estimated: \", r1[0] - r0[0])\n",
    "print(\"Difference:\", analytical - r1[0] + r0[0])\n",
    "print()\n",
    "print(\"Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\\n\")\n",
    "print(\"Analytical:\", analytic_R_on_Y(1, a, b, c, d),\n",
    "      analytic_R_on_Y(0, a, b, c, d))\n",
    "print(\"Estimated: \", r1[0], r0[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bias: (0.048298659980495415, 5.362228436888762e-16)\n",
      "Bias: (0.02685101277580984, 2.981061261820832e-16)\n",
      "Analytical: 0.030762705619782782\n",
      "Estimated:  0.021455460896022127\n",
      "Difference: 0.009307244723760655\n",
      "\n",
      "Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\n",
      "\n",
      "Analytical: 0.16344054312353337 0.1326778375037506\n",
      "Estimated:  0.15963912402042202 0.1381836631243999\n"
     ]
    }
   ],
   "source": [
    "###\n",
    "# Synthetic data 2 - with unobservable Z\n",
    "\n",
    "# R ~ U(0,1)\n",
    "# X ~ N(0,1)\n",
    "# T ~ Bin(1, sigmoid(a * r + b * x))\n",
    "# Y ~ Bin(1, sigmoid(c * t + d * x))\n",
    "\n",
    "# Weights:\n",
    "# R -> T: a\n",
    "# X -> T: b\n",
    "# T -> Y: c\n",
    "# X -> Y: d\n",
    "# Z -> T: e\n",
    "# Z -> Y: f\n",
    "\n",
    "N = 12000\n",
    "a = 1\n",
    "b = 1\n",
    "c = .5\n",
    "d = 1\n",
    "e = 1\n",
    "f = 1\n",
    "\n",
    "def generateDataWithZ(N=N, a=a, b=b, c=c, d=d, e=e, f=f):\n",
    "\n",
    "    r = npr.uniform(size=N)\n",
    "    x = npr.uniform(size=N)\n",
    "    \n",
    "    z = npr.uniform(size=N)\n",
    "\n",
    "    t = npr.binomial(n=1, p=sigmoid(a * r + b * x + e * z), size=N)\n",
    "\n",
    "    y = npr.binomial(n=1, p=sigmoid(c * t + d * x + f * z), size=N)\n",
    "\n",
    "    y[t == 0] = 1\n",
    "\n",
    "    return r, x, z, t, y\n",
    "\n",
    "\n",
    "def analytic_R_on_Y(r, a, b, c, d, e, f):\n",
    "    '''Return probability of Y=0 with given parameters.'''\n",
    "\n",
    "    # P(y=0|t=1, x) * p(t=1|r, x) * p(x)\n",
    "    def integrable(t, x, z):\n",
    "        p_y0 = 1 - sigmoid(c * t + d * x + f * z)\n",
    "        \n",
    "        if t == 1:\n",
    "            p_t = sigmoid(a * r + b * x + e * z)\n",
    "        elif t == 0:\n",
    "            p_t = 1 - sigmoid(a * r + b * x + e * z)\n",
    "        \n",
    "        p_x = 1\n",
    "        \n",
    "        return p_y0 * p_t * p_x\n",
    "    \n",
    "    t_1 = si.dblquad(lambda x, z: integrable(1, x, z), 0, 1, lambda x: 0, lambda x: 1)\n",
    "\n",
    "    # P(y=0|t=0, x) * p(t=0|r, x) * p(x)\n",
    "    # Now equal to 0, hence commented out. See data generation.\n",
    "    t_0 = si.dblquad(lambda x, z: integrable(0, x, z), 0, 1, lambda x: 0, lambda x: 1)\n",
    "\n",
    "    return t_1[0]  #+ t_0[0]\n",
    "\n",
    "\n",
    "r, x, z, t, y = generateDataWithZ()\n",
    "\n",
    "# Fit predictive models.\n",
    "lr_t = LogisticRegression(solver='lbfgs')\n",
    "lr_y = LogisticRegression(solver='lbfgs')\n",
    "\n",
    "lr_t = lr_t.fit(np.array([r, x]).T, t)\n",
    "lr_y = lr_y.fit(np.array([t[t == 1], x[t == 1]]).T, y[t == 1])\n",
    "\n",
    "\n",
    "def causal_effect_of_R_on_Y(r):\n",
    "\n",
    "    # Equal to integration of P(Y=0|T=0, X=x) * P(T=0|R=r, X=x) * f(x)\n",
    "    # from 0 to 1\n",
    "    print(\"Bias:\",\n",
    "        si.quad(\n",
    "            lambda x: lr_y.predict_proba(np.array([[0, x]]))[0, 0] * lr_t.\n",
    "            predict_proba(np.array([[r, x]]))[0, 0], 0, 1))\n",
    "\n",
    "    # Integrate P(Y=0|T=1, X=x) * P(T=1|R=r, X=x) * f(x) from 0 to 1\n",
    "    return (si.quad(\n",
    "        lambda x: lr_y.predict_proba(np.array([[1, x]]))[0, 0] * lr_t.\n",
    "        predict_proba(np.array([[r, x]]))[0, 1], 0, 1))\n",
    "\n",
    "\n",
    "r0 = causal_effect_of_R_on_Y(0)\n",
    "r1 = causal_effect_of_R_on_Y(1)\n",
    "\n",
    "analytical = analytic_R_on_Y(1, a, b, c, d, e, f) - analytic_R_on_Y(0, a, b, c, d, e, f)\n",
    "\n",
    "print(\"Analytical:\", analytical)\n",
    "print(\"Estimated: \", r1[0] - r0[0])\n",
    "print(\"Difference:\", analytical - r1[0] + r0[0])\n",
    "print()\n",
    "print(\"Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\\n\")\n",
    "print(\"Analytical:\", analytic_R_on_Y(1, a, b, c, d, e, f),\n",
    "      analytic_R_on_Y(0, a, b, c, d, e, f))\n",
    "print(\"Estimated: \", r1[0], r0[0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}