Bachelors_thesis_analyses.ipynb

      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.distplot(compas.age)\n",
    "plt.title(\"Histogram of defendants' ages\")\n",
    "plt.xlabel(\"Age of defendant\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>is_recid</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>age_cat</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>25 - 45</th>\n",
       "      <td>1784</td>\n",
       "      <td>1748</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than 45</th>\n",
       "      <td>847</td>\n",
       "      <td>446</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Less than 25</th>\n",
       "      <td>551</td>\n",
       "      <td>796</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "is_recid            0     1\n",
       "age_cat                    \n",
       "25 - 45          1784  1748\n",
       "Greater than 45   847   446\n",
       "Less than 25      551   796"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>is_recid</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sex</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Female</th>\n",
       "      <td>740</td>\n",
       "      <td>435</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Male</th>\n",
       "      <td>2442</td>\n",
       "      <td>2555</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "is_recid     0     1\n",
       "sex                 \n",
       "Female     740   435\n",
       "Male      2442  2555"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>is_recid</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>race</th>\n",
       "      <th>age_cat</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">African-American</th>\n",
       "      <th>25 - 45</th>\n",
       "      <td>847.0</td>\n",
       "      <td>1051.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than 45</th>\n",
       "      <td>261.0</td>\n",
       "      <td>207.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Less than 25</th>\n",
       "      <td>294.0</td>\n",
       "      <td>515.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">Asian</th>\n",
       "      <th>25 - 45</th>\n",
       "      <td>10.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than 45</th>\n",
       "      <td>7.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Less than 25</th>\n",
       "      <td>4.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">Caucasian</th>\n",
       "      <th>25 - 45</th>\n",
       "      <td>620.0</td>\n",
       "      <td>508.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than 45</th>\n",
       "      <td>442.0</td>\n",
       "      <td>186.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Less than 25</th>\n",
       "      <td>167.0</td>\n",
       "      <td>180.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">Hispanic</th>\n",
       "      <th>25 - 45</th>\n",
       "      <td>180.0</td>\n",
       "      <td>111.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than 45</th>\n",
       "      <td>81.0</td>\n",
       "      <td>28.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Less than 25</th>\n",
       "      <td>51.0</td>\n",
       "      <td>58.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">Native American</th>\n",
       "      <th>25 - 45</th>\n",
       "      <td>5.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than 45</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Less than 25</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">Other</th>\n",
       "      <th>25 - 45</th>\n",
       "      <td>122.0</td>\n",
       "      <td>72.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Greater than 45</th>\n",
       "      <td>56.0</td>\n",
       "      <td>19.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Less than 25</th>\n",
       "      <td>35.0</td>\n",
       "      <td>39.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "is_recid                              0       1\n",
       "race             age_cat                       \n",
       "African-American 25 - 45          847.0  1051.0\n",
       "                 Greater than 45  261.0   207.0\n",
       "                 Less than 25     294.0   515.0\n",
       "Asian            25 - 45           10.0     4.0\n",
       "                 Greater than 45    7.0     4.0\n",
       "                 Less than 25       4.0     2.0\n",
       "Caucasian        25 - 45          620.0   508.0\n",
       "                 Greater than 45  442.0   186.0\n",
       "                 Less than 25     167.0   180.0\n",
       "Hispanic         25 - 45          180.0   111.0\n",
       "                 Greater than 45   81.0    28.0\n",
       "                 Less than 25      51.0    58.0\n",
       "Native American  25 - 45            5.0     2.0\n",
       "                 Greater than 45    NaN     2.0\n",
       "                 Less than 25       NaN     2.0\n",
       "Other            25 - 45          122.0    72.0\n",
       "                 Greater than 45   56.0    19.0\n",
       "                 Less than 25      35.0    39.0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "tab = compas.groupby(['age_cat', 'is_recid']).size()\n",
    "display(tab.unstack())\n",
    "\n",
    "tab = compas.groupby(['sex', 'is_recid']).size()\n",
    "display(tab.unstack())\n",
    "\n",
    "tab = compas.groupby(['race', 'age_cat', 'is_recid']).size()\n",
    "display(tab.unstack())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From above it is clear that there are no Native American recidivists of age over 45 or under 25. There are some other value combinations that might be problematic."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Synthetic data\n",
    "\n",
    "In the chunk below, we generate the synthetic data as described by Lakkaraju et al. The default values are as per their description. The definitions of $Y$ and $T$ values follow their description.\n",
    "\n",
    "**Parameters**\n",
    "\n",
    "* M = `nJudges_M`, number of judges\n",
    "* N = `nSubjects_N`, number of subjects assigned to each judge\n",
    "* betas $\\beta_i$ = `beta_i`, where $i \\in \\{X, Z, W\\}$ are coefficients for the respected variables\n",
    "* R = `acceptanceRate_R`, acceptance rates\n",
    "* X = `X`, invidual's features observable to all (models and judges)\n",
    "* Z = `Z`, information observable for judges only\n",
    "* W = `W`, unobservable / inaccessible information\n",
    "* T = `decision_T`, decisions where $T=0$ represents decision to deny and if $T=1$ then bail is granted.\n",
    "* Y = `result_Y`, result variable, if $Y=0$ person will or would recidivate and if $Y=1$ person would not commit a crime."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>judgeID_J</th>\n",
       "      <td>50000.0</td>\n",
       "      <td>49.500000</td>\n",
       "      <td>28.866359</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>24.750000</td>\n",
       "      <td>49.500000</td>\n",
       "      <td>74.250000</td>\n",
       "      <td>99.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>acceptanceRate_R</th>\n",
       "      <td>50000.0</td>\n",
       "      <td>0.478235</td>\n",
       "      <td>0.230644</td>\n",
       "      <td>0.103756</td>\n",
       "      <td>0.264643</td>\n",
       "      <td>0.473985</td>\n",
       "      <td>0.647587</td>\n",
       "      <td>0.890699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>X</th>\n",
       "      <td>50000.0</td>\n",
       "      <td>-0.003875</td>\n",
       "      <td>0.996715</td>\n",
       "      <td>-4.659953</td>\n",
       "      <td>-0.671782</td>\n",
       "      <td>-0.001726</td>\n",
       "      <td>0.668077</td>\n",
       "      <td>3.831790</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Z</th>\n",
       "      <td>50000.0</td>\n",
       "      <td>0.006964</td>\n",
       "      <td>0.998001</td>\n",
       "      <td>-4.852118</td>\n",
       "      <td>-0.666258</td>\n",
       "      <td>0.004730</td>\n",
       "      <td>0.679477</td>\n",
       "      <td>4.241772</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>W</th>\n",
       "      <td>50000.0</td>\n",
       "      <td>0.010863</td>\n",
       "      <td>0.996944</td>\n",
       "      <td>-4.029138</td>\n",
       "      <td>-0.666574</td>\n",
       "      <td>0.012306</td>\n",
       "      <td>0.679578</td>\n",
       "      <td>4.285856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>result_Y</th>\n",
       "      <td>50000.0</td>\n",
       "      <td>0.496500</td>\n",
       "      <td>0.499993</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>probabilities_T</th>\n",
       "      <td>50000.0</td>\n",
       "      <td>0.500794</td>\n",
       "      <td>0.279762</td>\n",
       "      <td>-0.335627</td>\n",
       "      <td>0.276723</td>\n",
       "      <td>0.501317</td>\n",
       "      <td>0.723352</td>\n",
       "      <td>1.295719</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>decision_T</th>\n",
       "      <td>50000.0</td>\n",
       "      <td>0.477260</td>\n",
       "      <td>0.499488</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    count       mean        std       min        25%  \\\n",
       "judgeID_J         50000.0  49.500000  28.866359  0.000000  24.750000   \n",
       "acceptanceRate_R  50000.0   0.478235   0.230644  0.103756   0.264643   \n",
       "X                 50000.0  -0.003875   0.996715 -4.659953  -0.671782   \n",
       "Z                 50000.0   0.006964   0.998001 -4.852118  -0.666258   \n",
       "W                 50000.0   0.010863   0.996944 -4.029138  -0.666574   \n",
       "result_Y          50000.0   0.496500   0.499993  0.000000   0.000000   \n",
       "probabilities_T   50000.0   0.500794   0.279762 -0.335627   0.276723   \n",
       "decision_T        50000.0   0.477260   0.499488  0.000000   0.000000   \n",
       "\n",
       "                        50%        75%        max  \n",
       "judgeID_J         49.500000  74.250000  99.000000  \n",
       "acceptanceRate_R   0.473985   0.647587   0.890699  \n",
       "X                 -0.001726   0.668077   3.831790  \n",
       "Z                  0.004730   0.679477   4.241772  \n",
       "W                  0.012306   0.679578   4.285856  \n",
       "result_Y           0.000000   1.000000   1.000000  \n",
       "probabilities_T    0.501317   0.723352   1.295719  \n",
       "decision_T         0.000000   1.000000   1.000000  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{tabular}{lrrrrrrrr}\n",
      "\\hline\n",
      "                  &    Määrä &   Keskiarvo &   Keskihajonta &   Minimi &   25\\% &   50\\% &   75\\% &   Maksimi \\\\\n",
      "\\hline\n",
      " judgeID\\_J        & 50000.00 &       49.50 &          28.87 &     0.00 & 24.75 & 49.50 & 74.25 &     99.00 \\\\\n",
      " acceptanceRate\\_R & 50000.00 &        0.48 &           0.23 &     0.10 &  0.26 &  0.47 &  0.65 &      0.89 \\\\\n",
      " X                & 50000.00 &       -0.00 &           1.00 &    -4.66 & -0.67 & -0.00 &  0.67 &      3.83 \\\\\n",
      " Z                & 50000.00 &        0.01 &           1.00 &    -4.85 & -0.67 &  0.00 &  0.68 &      4.24 \\\\\n",
      " W                & 50000.00 &        0.01 &           1.00 &    -4.03 & -0.67 &  0.01 &  0.68 &      4.29 \\\\\n",
      " result\\_Y         & 50000.00 &        0.50 &           0.50 &     0.00 &  0.00 &  0.00 &  1.00 &      1.00 \\\\\n",
      " probabilities\\_T  & 50000.00 &        0.50 &           0.28 &    -0.34 &  0.28 &  0.50 &  0.72 &      1.30 \\\\\n",
      " decision\\_T       & 50000.00 &        0.48 &           0.50 &     0.00 &  0.00 &  0.00 &  1.00 &      1.00 \\\\\n",
      "\\hline\n",
      "\\end{tabular}\n",
      "0    26137\n",
      "1    23863\n",
      "Name: decision_T, dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>decision_T</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>result_Y</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0.0</th>\n",
       "      <td>20083</td>\n",
       "      <td>5092</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1.0</th>\n",
       "      <td>6054</td>\n",
       "      <td>18771</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "decision_T      0      1\n",
       "result_Y                \n",
       "0.0         20083   5092\n",
       "1.0          6054  18771"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Set seed for reproducibility\n",
    "npr.seed(0)\n",
    "\n",
    "\n",
    "def generateData(nJudges_M=100,\n",
    "                 nSubjects_N=500,\n",
    "                 beta_X=1.0,\n",
    "                 beta_Z=1.0,\n",
    "                 beta_W=0.2):\n",
    "\n",
    "    # Assign judge IDs as running numbering from 0 to nJudges_M - 1\n",
    "    judgeID_J = np.repeat(np.arange(0, nJudges_M, dtype=np.int32), nSubjects_N)\n",
    "\n",
    "    # Sample acceptance rates uniformly from a closed interval\n",
    "    # from 0.1 to 0.9 and round to tenth decimal place.\n",
    "    acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 10)\n",
    "\n",
    "    # Replicate the rates so they can be attached to the corresponding judge ID.\n",
    "    acceptanceRate_R = np.repeat(acceptance_rates, nSubjects_N)\n",
    "\n",
    "    # Sample the variables from standard Gaussian distributions.\n",
    "    X = npr.normal(size=nJudges_M * nSubjects_N)\n",
    "    Z = npr.normal(size=nJudges_M * nSubjects_N)\n",
    "    W = npr.normal(size=nJudges_M * nSubjects_N)\n",
    "\n",
    "    probabilities_Y = 1 / (1 + np.exp(-(beta_X * X + beta_Z * Z + beta_W * W)))\n",
    "\n",
    "    # 0 if P(Y = 0| X = x; Z = z; W = w) >= 0.5 , 1 otherwise\n",
    "    result_Y = 1 - probabilities_Y.round()\n",
    "\n",
    "    probabilities_T = 1 / (1 + np.exp(-(beta_X * X + beta_Z * Z)))\n",
    "    probabilities_T += npr.normal(0, .1, nJudges_M * nSubjects_N)\n",
    "\n",
    "    # Initialize decision values as 1\n",
    "    decision_T = np.ones(nJudges_M * nSubjects_N)\n",
    "\n",
    "    # Initialize the dataframe\n",
    "    df_init = pd.DataFrame(\n",
    "        np.column_stack((judgeID_J, acceptanceRate_R, X, Z, W, result_Y,\n",
    "                         probabilities_T, decision_T)),\n",
    "        columns=[\n",
    "            \"judgeID_J\", \"acceptanceRate_R\", \"X\", \"Z\", \"W\", \"result_Y\",\n",
    "            \"probabilities_T\", \"decision_T\"\n",
    "        ])\n",
    "\n",
    "    # Sort by judges then probabilities\n",
    "    data = df_init.sort_values(\n",
    "        by=[\"judgeID_J\", \"probabilities_T\"], ascending=False)\n",
    "\n",
    "    # Iterate over the data. Subject is in the top (1-r)*100% if\n",
    "    # his within-judge-index is over acceptance threshold times\n",
    "    # the number of subjects assigned to each judge. If subject\n",
    "    # is over the limit they are assigned a zero, else one.\n",
    "    data.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    data['decision_T'] = np.where(\n",
    "        (data.index.values % nSubjects_N) <\n",
    "        ((1 - data['acceptanceRate_R']) * nSubjects_N), 0, 1)\n",
    "\n",
    "    return data\n",
    "\n",
    "\n",
    "df = []\n",
    "df = generateData()\n",
    "\n",
    "# Basic stats of the created data set.\n",
    "display(df.describe().T)\n",
    "\n",
    "\n",
    "print(df.decision_T.value_counts())\n",
    "\n",
    "tab = df.groupby(['result_Y', 'decision_T']).size()\n",
    "display(tab.unstack())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(25000, 8)\n",
      "(25000, 8)\n",
      "(11866, 8)\n",
      "(11997, 8)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>decision_T</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>result_Y</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0.0</th>\n",
       "      <td>2495</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1.0</th>\n",
       "      <td>9371</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "decision_T     1\n",
       "result_Y        \n",
       "0.0         2495\n",
       "1.0         9371"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split the data set to test and train\n",
    "from sklearn.model_selection import train_test_split\n",
    "train, test = train_test_split(df, test_size=0.5, random_state=0)\n",
    "\n",
    "print(train.shape)\n",
    "print(test.shape)\n",
    "\n",
    "train_labeled = train[train.decision_T == 1]\n",
    "test_labeled = test[test.decision_T == 1]\n",
    "\n",
    "print(train_labeled.shape)\n",
    "print(test_labeled.shape)\n",
    "\n",
    "tab = train_labeled.groupby(['result_Y', 'decision_T']).size()\n",
    "tab.unstack()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Algorithms\n",
    "\n",
    "### Contraction algorithm\n",
    "\n",
    "Below is an implementation of Lakkaraju's team's algorithm presented in [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066). Relevant parameters to be passed to the function are presented in the description."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def contraction(df,\n",
    "                judgeIDJ_col,\n",
    "                decisionT_col,\n",
    "                resultY_col,\n",
    "                modelProbS_col,\n",
    "                accRateR_col,\n",
    "                r,\n",
    "                binning=False):\n",
    "    '''\n",
    "    This is an implementation of the algorithm presented by Lakkaraju\n",
    "    et al. in their paper \"The Selective Labels Problem: Evaluating \n",
    "    Algorithmic Predictions in the Presence of Unobservables\" (2017).\n",
    "    \n",
    "    Parameters:\n",
    "    df = The (Pandas) data frame containing the data, judge decisions,\n",
    "    judge IDs, results and probability scores.\n",
    "    judgeIDJ_col = String, the name of the column containing the judges' IDs\n",
    "    in df.\n",
    "    decisionT_col = String, the name of the column containing the judges' decisions\n",
    "    resultY_col = String, the name of the column containing the realization\n",
    "    modelProbS_col = String, the name of the column containing the probability\n",
    "    scores from the black-box model B.\n",
    "    accRateR_col = String, the name of the column containing the judges' \n",
    "    acceptance rates\n",
    "    r = Float between 0 and 1, the given acceptance rate.\n",
    "    binning = Boolean, should judges with same acceptance rate be binned\n",
    "    \n",
    "    Returns:\n",
    "    u = The estimated failure rate at acceptance rate r.\n",
    "    '''\n",
    "    # Sort first by acceptance rate and judge ID.\n",
    "    sorted_df = df.sort_values(\n",
    "        by=[accRateR_col, judgeIDJ_col], ascending=False)\n",
    "\n",
    "    if binning:\n",
    "        # Get maximum leniency\n",
    "        max_leniency = sorted_df[accRateR_col].values[0].round(1)\n",
    "\n",
    "        # Get list of judges that are the most lenient\n",
    "        most_lenient_list = sorted_df.loc[sorted_df[accRateR_col].round(1) ==\n",
    "                                          max_leniency, judgeIDJ_col]\n",
    "\n",
    "        # Subset to obtain D_q\n",
    "        D_q = sorted_df[sorted_df[judgeIDJ_col].isin(\n",
    "            most_lenient_list.unique())]\n",
    "    else:\n",
    "        # Get most lenient judge\n",
    "        most_lenient_ID = sorted_df[judgeIDJ_col].values[0]\n",
    "\n",
    "        # Subset\n",
    "        D_q = sorted_df[sorted_df[judgeIDJ_col] == most_lenient_ID]\n",
    "\n",
    "    R_q = D_q[D_q[decisionT_col] == 1]\n",
    "\n",
    "    R_sort_q = R_q.sort_values(by=modelProbS_col, ascending=False)\n",
    "\n",
    "    number_to_remove = int(\n",
    "        np.round((1 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))\n",
    "\n",
    "    R_B = R_sort_q[number_to_remove:R_sort_q.shape[0]]\n",
    "\n",
    "    return np.sum(R_B[resultY_col] == 0) / D_q.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Causal model\n",
    "\n",
    "Our model is defined by the probabilistic expression \n",
    "\n",
    "\\begin{equation}\\label{model}\n",
    "P(Y=0 | \\text{do}(R=r)) = \\sum_x \\underbrace{P(Y=0|X=x, T=1)}_\\text{1} \n",
    "\\overbrace{P(T=1|R=r, X=x)}^\\text{2} \n",
    "\\underbrace{P(X=x)}_\\text{3}\n",
    "\\end{equation}\n",
    "\n",
    "As a picture (Z not in model):\n",
    "\n",
    "![Model as picture](../figures/intervention_model.png \"Intervention model\")\n",
    "\n",
    "**Algorithm**\n",
    "\n",
    "Our model will be constructed sequentially.\n",
    "\n",
    "Input: Data $(\\mathbf{x}, t, y) \\in \\mathcal{D}$ and acceptance rate $r$.  \n",
    "Returns: $P(Y=0 | \\text{do}(R=r))$\n",
    "\n",
    "Procedure:\n",
    "1. Model $P(X=x)$ in a suitable way and assign to $\\mathcal{M}_0$\n",
    "*  Build model $\\mathcal{M}_1$ predicting response $Y$ with predictors $X$ from the labeled observations (where $T=1$) in training data.\n",
    "*  Predict $P(Y=0|X=x)$ for every observation in the test data using model $\\mathcal{M}_1$.\n",
    "*  Order all observations by the probability score from previous phase in ascending order. Now the \"most harmless\" are at the top of the list.\n",
    "*  Initialize `sum = 0`\n",
    "*  For every point in the parameter space (for every $x$ in $X$)\n",
    "    1. $p_x \\leftarrow P(X=x)$ from $\\mathcal{M}_0$\n",
    "    *  $\\mathcal{D_x} \\leftarrow \\{\\mathcal{D} | X = x\\}$\n",
    "    *  Assign first $r\\cdot 100\\%$ observations from $\\mathcal{D_x}$ to $\\mathcal{D}_{rx}$\n",
    "    *  $p_t \\leftarrow \\dfrac{|\\{\\mathcal{D}_{rx}|T=1\\}|}{|\\mathcal{D}_{rx}|}$\n",
    "    *  $p_y$ will be predicted from the model $\\mathcal{M}_1$\n",
    "    *  `sum +=` $p_y \\cdot p_t \\cdot p_x$\n",
    "* Return `sum`\n",
    "\n",
    "**Constructing $\\mathcal{M}_0$, preliminary ideas:**\n",
    "\n",
    "* Approximate $P(X=x)$ with frequencies (make variables factors first)\n",
    "\n",
    "TEE ALLA OLEVASTA FUNKTIO!!! Kaikki sklearnin mallit implementtaa fitin ja predictin https://scikit-learn.org/stable/tutorial/basic/tutorial.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.13017179 0.1265672  0.12309921 0.12101056 0.11946303 0.1163152\n",
      " 0.11309497 0.10995003]\n"
     ]
    }
   ],
   "source": [
    "r_list = np.linspace(0.1, 0.8, 8, endpoint=True)\n",
    "\n",
    "# 1 For now we don't model f(x),  just utilize it's N(0, 1)\n",
    "\n",
    "# 2\n",
    "lr_causal_result = LogisticRegression(solver='lbfgs')\n",
    "\n",
    "# fit, reshape X to be of shape (n_samples, n_features)\n",
    "lr_causal_result.fit(train_labeled['X'].values.reshape(-1, 1),\n",
    "                     train_labeled.result_Y)\n",
    "\n",
    "# 3\n",
    "causal_probs = lr_causal_result.predict_proba(test.X.values.reshape(-1, 1))\n",
    "\n",
    "test = test.assign(B_prob_0_causal=causal_probs[:, 0])\n",
    "\n",
    "# 4\n",
    "test_ordered = test.sort_values(by='B_prob_0_causal', ascending=True)\n",
    "\n",
    "#5\n",
    "probability_list = np.zeros_like(r_list)\n",
    "i = 0\n",
    "for r in r_list:\n",
    "\n",
    "    # 6\n",
    "    for x in range(-5, 5):\n",
    "        # A\n",
    "        p_x = scs.norm.pdf(x)\n",
    "        # B\n",
    "        D_x = test_ordered[test_ordered.X.round(0) == x]\n",
    "\n",
    "        if D_x.shape[0] == 0:\n",
    "            continue\n",
    "        # C\n",
    "        if round(r * D_x.shape[0]) == 0:\n",
    "            continue\n",
    "\n",
    "        D_rx = D_x[0:int(round(r * D_x.shape[0]))]\n",
    "\n",
    "        # D\n",
    "        p_t = np.sum(D_rx.decision_T == 1) / D_rx.shape[0]\n",
    "        # E\n",
    "        p_y = lr_causal_result.predict_proba(np.array(x).reshape(-1, 1))\n",
    "        # F\n",
    "        probability_list[i] += p_y[0, 0] * p_t * p_x\n",
    "\n",
    "    i += 1\n",
    "\n",
    "# 7\n",
    "print(probability_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Performance comparison\n",
    "\n",
    "Below we try to replicate the results obtained by Lakkaraju and compare their model's performance to the one of ours.\n",
    "\n",
    "### Synthetic data\n",
    "\n",
    "#### Predictive models\n",
    "\n",
    "Lakkaraju says that they used logistic regression to predict recidivism. We train the model using only *observed observations*, i.e. defendants that were granted bail and are in the train set. We then predict the probability of recidivism for all observations in the test data and attach it to our data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# instantiate the model (using the default parameters)\n",
    "logreg_machine = LogisticRegression(solver='lbfgs')\n",
    "\n",
    "# fit, reshape X to be of shape (n_samples, n_features)\n",
    "logreg_machine.fit(\n",
    "    train_labeled.X.values.reshape(-1, 1), train_labeled.result_Y)\n",
    "\n",
    "# predict probabilities and attach to data\n",
    "label_probabilities_machine = logreg_machine.predict_proba(\n",
    "    test.X.values.reshape(-1, 1))\n",
    "\n",
    "test = test.assign(B_prob_0_machine=label_probabilities_machine[:, 0])\n",
    "\n",
    "\n",
    "clf = tree.DecisionTreeClassifier()\n",
    "clf = clf.fit(train_labeled.X.values.reshape(-1, 1), train_labeled.result_Y)\n",
    "\n",
    "preds = clf.predict_proba(test.X.values.reshape(-1, 1))\n",
    "\n",
    "test = test.assign(B_prob_0_tree=preds[:, 0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Causal model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#probs = causal()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Visual comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,