Skip to content
Snippets Groups Projects
derivation_validation.ipynb 10.7 KiB
Newer Older
  • Learn to ignore specific revisions
  • {
     "cells": [
      {
       "cell_type": "code",
    
    Riku-Laine's avatar
    Riku-Laine committed
       "execution_count": 1,
    
       "metadata": {},
       "outputs": [],
       "source": [
        "# Imports\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from datetime import datetime\n",
        "import matplotlib.pyplot as plt\n",
        "import scipy.stats as scs\n",
        "import scipy.integrate as si\n",
        "import seaborn as sns\n",
        "import numpy.random as npr\n",
        "from sklearn.preprocessing import OneHotEncoder\n",
        "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.preprocessing import PolynomialFeatures\n",
        "\n",
        "\n",
        "# Settings\n",
        "\n",
        "%matplotlib inline\n",
        "\n",
        "plt.rcParams.update({'font.size': 16})\n",
        "plt.rcParams.update({'figure.figsize': (14, 7)})\n",
        "\n",
        "# Suppress deprecation warnings.\n",
        "\n",
        "import warnings\n",
        "\n",
        "\n",
        "def fxn():\n",
        "    warnings.warn(\"deprecated\", DeprecationWarning)\n",
        "\n",
        "\n",
        "with warnings.catch_warnings():\n",
        "    warnings.simplefilter(\"ignore\")\n",
        "    fxn()\n",
        "    \n",
        "def sigmoid(x):\n",
        "    '''Return value of sigmoid function (inverse of logit) at x.'''\n",
        "\n",
        "    return 1 / (1 + np.exp(-x))\n"
       ]
      },
      {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
        "In the code below\n",
        "\n",
        "$$P(Y=0|do(R=r))=\\int_x P(Y=0|T=1, X=x)P(T=1|R=r, X=x)f_x(x)~dx$$\n",
        "\n",
        "where $f_x(x)$ is the pdf of x. (Now $X \\sim U(0, 1)$, so $f_x(x)=1$.)"
       ]
      },
      {
       "cell_type": "code",
    
    Riku-Laine's avatar
    Riku-Laine committed
       "execution_count": 2,
    
       "metadata": {
        "scrolled": false
       },
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
    
          "Bias: (0.07131297477417739, 7.917330654880471e-16)\n",
          "Bias: (0.035957051207116175, 3.9920346147766154e-16)\n",
    
          "Analytical: 0.03709585053394618\n",
    
          "Estimated:  0.035378945320598335\n",
          "Difference: 0.001716905213347844\n",
    
          "\n",
          "Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\n",
          "\n",
          "Analytical: 0.14973849934787756 0.11264264881393138\n",
    
          "Estimated:  0.1432745402973986 0.10789559497680028\n"
    
         ]
        }
       ],
       "source": [
        "###\n",
    
        "# Synthetic data 1\n",
    
        "\n",
        "# R ~ U(0,1)\n",
        "# X ~ N(0,1)\n",
        "# T ~ Bin(1, sigmoid(a * r + b * x))\n",
        "# Y ~ Bin(1, sigmoid(c * t + d * x))\n",
        "\n",
        "# Weights:\n",
        "# R -> T: a\n",
        "# X -> T: b\n",
        "# T -> Y: c\n",
        "# X -> Y: d\n",
        "\n",
        "N = 12000\n",
        "a = 1\n",
        "b = 1\n",
        "c = 1\n",
        "d = 1\n",
        "\n",
        "\n",
        "def generateData(N=N, a=a, b=b, c=c, d=d):\n",
        "\n",
        "    r = npr.uniform(size=N)\n",
        "    x = npr.uniform(size=N)\n",
        "\n",
        "    t = npr.binomial(n=1, p=sigmoid(a * r + b * x), size=N)\n",
        "\n",
        "    y = npr.binomial(n=1, p=sigmoid(c * t + d * x), size=N)\n",
        "\n",
        "    y[t == 0] = 1\n",
        "\n",
        "    return r, x, t, y\n",
        "\n",
        "\n",
        "def analytic_R_on_Y(r, a, b, c, d):\n",
        "    '''Return probability of Y=0 with given parameters.'''\n",
        "\n",
        "    # P(y=0|t=1, x) * p(t=1|r, x) * p(x)\n",
        "    t_1 = si.quad(\n",
        "        lambda x: (1 - sigmoid(c * 1 + d * x)) * sigmoid(a * r + b * x) * 1, 0,\n",
        "        1)\n",
        "\n",
        "    # P(y=0|t=0, x) * p(t=0|r, x) * p(x)\n",
        "    # Now equal to 0, hence commented out. See data generation.\n",
        "    t_0 = si.quad(\n",
        "        lambda x: (1 - sigmoid(c * 0 + d * x)) * (1 - sigmoid(a * r + b * x)) *\n",
        "        1, 0, 1)\n",
        "\n",
        "    return t_1[0]  #+ t_0[0]\n",
        "\n",
        "\n",
        "r, x, t, y = generateData()\n",
        "\n",
        "# Fit predictive models.\n",
        "lr_t = LogisticRegression(solver='lbfgs')\n",
        "lr_y = LogisticRegression(solver='lbfgs')\n",
        "\n",
        "lr_t = lr_t.fit(np.array([r, x]).T, t)\n",
        "lr_y = lr_y.fit(np.array([t[t == 1], x[t == 1]]).T, y[t == 1])\n",
        "\n",
        "\n",
        "def causal_effect_of_R_on_Y(r):\n",
        "\n",
        "    # Equal to integration of P(Y=0|T=0, X=x) * P(T=0|R=r, X=x) * f(x)\n",
        "    # from 0 to 1\n",
        "    print(\"Bias:\",\n",
        "        si.quad(\n",
        "            lambda x: lr_y.predict_proba(np.array([[0, x]]))[0, 0] * lr_t.\n",
        "            predict_proba(np.array([[r, x]]))[0, 0], 0, 1))\n",
        "\n",
        "    # Integrate P(Y=0|T=1, X=x) * P(T=1|R=r, X=x) * f(x) from 0 to 1\n",
        "    return (si.quad(\n",
        "        lambda x: lr_y.predict_proba(np.array([[1, x]]))[0, 0] * lr_t.\n",
        "        predict_proba(np.array([[r, x]]))[0, 1], 0, 1))\n",
        "\n",
        "\n",
        "r0 = causal_effect_of_R_on_Y(0)\n",
        "r1 = causal_effect_of_R_on_Y(1)\n",
        "\n",
        "analytical = analytic_R_on_Y(1, a, b, c, d) - analytic_R_on_Y(0, a, b, c, d)\n",
        "\n",
        "print(\"Analytical:\", analytical)\n",
        "print(\"Estimated: \", r1[0] - r0[0])\n",
        "print(\"Difference:\", analytical - r1[0] + r0[0])\n",
        "print()\n",
        "print(\"Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\\n\")\n",
        "print(\"Analytical:\", analytic_R_on_Y(1, a, b, c, d),\n",
        "      analytic_R_on_Y(0, a, b, c, d))\n",
        "print(\"Estimated: \", r1[0], r0[0])"
       ]
    
    Riku-Laine's avatar
    Riku-Laine committed
      },
      {
       "cell_type": "code",
    
       "execution_count": 91,
    
    Riku-Laine's avatar
    Riku-Laine committed
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
    
          "Bias: (0.048298659980495415, 5.362228436888762e-16)\n",
          "Bias: (0.02685101277580984, 2.981061261820832e-16)\n",
          "Analytical: 0.030762705619782782\n",
          "Estimated:  0.021455460896022127\n",
          "Difference: 0.009307244723760655\n",
          "\n",
          "Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\n",
          "\n",
          "Analytical: 0.16344054312353337 0.1326778375037506\n",
          "Estimated:  0.15963912402042202 0.1381836631243999\n"
    
    Riku-Laine's avatar
    Riku-Laine committed
         ]
        }
       ],
       "source": [
    
        "###\n",
        "# Synthetic data 2 - with unobservable Z\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "\n",
    
        "# R ~ U(0,1)\n",
        "# X ~ N(0,1)\n",
        "# T ~ Bin(1, sigmoid(a * r + b * x))\n",
        "# Y ~ Bin(1, sigmoid(c * t + d * x))\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "\n",
    
        "# Weights:\n",
        "# R -> T: a\n",
        "# X -> T: b\n",
        "# T -> Y: c\n",
        "# X -> Y: d\n",
        "# Z -> T: e\n",
        "# Z -> Y: f\n",
        "\n",
        "N = 12000\n",
        "a = 1\n",
        "b = 1\n",
        "c = .5\n",
        "d = 1\n",
        "e = 1\n",
        "f = 1\n",
        "\n",
        "def generateDataWithZ(N=N, a=a, b=b, c=c, d=d, e=e, f=f):\n",
        "\n",
        "    r = npr.uniform(size=N)\n",
        "    x = npr.uniform(size=N)\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "    \n",
    
        "    z = npr.uniform(size=N)\n",
        "\n",
        "    t = npr.binomial(n=1, p=sigmoid(a * r + b * x + e * z), size=N)\n",
        "\n",
        "    y = npr.binomial(n=1, p=sigmoid(c * t + d * x + f * z), size=N)\n",
        "\n",
        "    y[t == 0] = 1\n",
        "\n",
        "    return r, x, z, t, y\n",
        "\n",
        "\n",
        "def analytic_R_on_Y(r, a, b, c, d, e, f):\n",
        "    '''Return probability of Y=0 with given parameters.'''\n",
        "\n",
        "    # P(y=0|t=1, x) * p(t=1|r, x) * p(x)\n",
        "    def integrable(t, x, z):\n",
        "        p_y0 = 1 - sigmoid(c * t + d * x + f * z)\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "        \n",
    
        "        if t == 1:\n",
        "            p_t = sigmoid(a * r + b * x + e * z)\n",
        "        elif t == 0:\n",
        "            p_t = 1 - sigmoid(a * r + b * x + e * z)\n",
        "        \n",
        "        p_x = 1\n",
        "        \n",
        "        return p_y0 * p_t * p_x\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "    \n",
    
        "    t_1 = si.dblquad(lambda x, z: integrable(1, x, z), 0, 1, lambda x: 0, lambda x: 1)\n",
        "\n",
        "    # P(y=0|t=0, x) * p(t=0|r, x) * p(x)\n",
        "    # Now equal to 0, hence commented out. See data generation.\n",
        "    t_0 = si.dblquad(lambda x, z: integrable(0, x, z), 0, 1, lambda x: 0, lambda x: 1)\n",
        "\n",
        "    return t_1[0]  #+ t_0[0]\n",
        "\n",
        "\n",
        "r, x, z, t, y = generateDataWithZ()\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "\n",
    
        "# Fit predictive models.\n",
        "lr_t = LogisticRegression(solver='lbfgs')\n",
        "lr_y = LogisticRegression(solver='lbfgs')\n",
        "\n",
        "lr_t = lr_t.fit(np.array([r, x]).T, t)\n",
        "lr_y = lr_y.fit(np.array([t[t == 1], x[t == 1]]).T, y[t == 1])\n",
        "\n",
        "\n",
        "def causal_effect_of_R_on_Y(r):\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "\n",
    
        "    # Equal to integration of P(Y=0|T=0, X=x) * P(T=0|R=r, X=x) * f(x)\n",
        "    # from 0 to 1\n",
        "    print(\"Bias:\",\n",
        "        si.quad(\n",
        "            lambda x: lr_y.predict_proba(np.array([[0, x]]))[0, 0] * lr_t.\n",
        "            predict_proba(np.array([[r, x]]))[0, 0], 0, 1))\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "\n",
    
        "    # Integrate P(Y=0|T=1, X=x) * P(T=1|R=r, X=x) * f(x) from 0 to 1\n",
        "    return (si.quad(\n",
        "        lambda x: lr_y.predict_proba(np.array([[1, x]]))[0, 0] * lr_t.\n",
        "        predict_proba(np.array([[r, x]]))[0, 1], 0, 1))\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "\n",
        "\n",
    
        "r0 = causal_effect_of_R_on_Y(0)\n",
        "r1 = causal_effect_of_R_on_Y(1)\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "\n",
    
        "analytical = analytic_R_on_Y(1, a, b, c, d, e, f) - analytic_R_on_Y(0, a, b, c, d, e, f)\n",
    
    Riku-Laine's avatar
    Riku-Laine committed
        "\n",
    
        "print(\"Analytical:\", analytical)\n",
        "print(\"Estimated: \", r1[0] - r0[0])\n",
        "print(\"Difference:\", analytical - r1[0] + r0[0])\n",
        "print()\n",
        "print(\"Values for P(y=0|do(r=1)) and P(y=0|do(r=0))\\n\")\n",
        "print(\"Analytical:\", analytic_R_on_Y(1, a, b, c, d, e, f),\n",
        "      analytic_R_on_Y(0, a, b, c, d, e, f))\n",
        "print(\"Estimated: \", r1[0], r0[0])"
    
    Riku-Laine's avatar
    Riku-Laine committed
       ]
    
      }
     ],
     "metadata": {
      "kernelspec": {
       "display_name": "Python 3",
       "language": "python",
       "name": "python3"
      },
      "language_info": {
       "codemirror_mode": {
        "name": "ipython",
        "version": 3
       },
       "file_extension": ".py",
       "mimetype": "text/x-python",
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
       "version": "3.7.3"
      },
      "toc": {
       "base_numbering": 1,
       "nav_menu": {},
       "number_sections": true,
       "sideBar": true,
       "skip_h1_title": false,
       "title_cell": "Table of Contents",
       "title_sidebar": "Contents",
       "toc_cell": false,
       "toc_position": {},
       "toc_section_display": true,
       "toc_window_display": false
      },
      "varInspector": {
       "cols": {
        "lenName": 16,
        "lenType": 16,
        "lenVar": 40
       },
       "kernels_config": {
        "python": {
         "delete_cmd_postfix": "",
         "delete_cmd_prefix": "del ",
         "library": "var_list.py",
         "varRefreshCmd": "print(var_dic_list())"
        },
        "r": {
         "delete_cmd_postfix": ") ",
         "delete_cmd_prefix": "rm(",
         "library": "var_list.r",
         "varRefreshCmd": "cat(var_dic_list()) "
        }
       },
       "types_to_exclude": [
        "module",
        "function",
        "builtin_function_or_method",
        "instance",
        "_Feature"
       ],
       "window_display": false
      }
     },
     "nbformat": 4,
     "nbformat_minor": 2
    }