From 823361089a2bc051fe6214ccb605d020a34105a5 Mon Sep 17 00:00:00 2001
From: Riku-Laine <28960190+Riku-Laine@users.noreply.github.com>
Date: Thu, 14 Mar 2019 10:42:35 +0200
Subject: [PATCH] Lakkaraju v1 implemented, has bugs

---
 Bachelors_thesis_analyses.ipynb | 259 ++++++++++++++++++++------------
 1 file changed, 166 insertions(+), 93 deletions(-)

diff --git a/Bachelors_thesis_analyses.ipynb b/Bachelors_thesis_analyses.ipynb
index 8e4b54e..7635956 100644
--- a/Bachelors_thesis_analyses.ipynb
+++ b/Bachelors_thesis_analyses.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Bachelors thesis: analyses\n",
+    "# Bachelor's thesis analyses\n",
     "\n",
     "*This Jupyter notebook is for the analyses and model building for Riku Laine's bachelors thesis*\n",
     "\n",
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -22,7 +22,7 @@
        "(7214, 53)"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -319,27 +319,9 @@
     "                                                                    grid = True)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Implement Lakkaraju\n",
-    "\n",
-    "*Below is an implementation of Lakkaraju's algorithm presented in XYZ (link TBA)*\n",
-    "\n",
-    "* M = number of judges\n",
-    "* subj = number of subjects assigned to each judge\n",
-    "* betas $\\beta$ are coefficients\n",
-    "* R = acceptance rates\n",
-    "* X = invidual's features observable to all (models and judges)\n",
-    "* Z = information observable for judges only\n",
-    "* W = unobservable / inaccessible information\n",
-    "* T = decisions"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -505,10 +487,10 @@
     {
      "data": {
       "text/plain": [
-       "<matplotlib.axes._subplots.AxesSubplot at 0x1d7fd04c7b8>"
+       "<matplotlib.axes._subplots.AxesSubplot at 0x2d738932400>"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -531,9 +513,27 @@
     "sns.kdeplot(np.array(compas_raw.age))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generate synthetic data set\n",
+    "\n",
+    "In the chunk below, we generate the synthetic data as described by Lakkaraju et al.\n",
+    "\n",
+    "* M = number of judges\n",
+    "* subj = number of subjects assigned to each judge\n",
+    "* betas $\\beta$ are coefficients\n",
+    "* R = acceptance rates\n",
+    "* X = individual's features observable to all (models and judges)\n",
+    "* Z = information observable for judges only\n",
+    "* W = unobservable / inaccessible information\n",
+    "* T = decisions"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 63,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -541,48 +541,48 @@
     "\n",
     "npr.seed(0)\n",
     "\n",
-    "M = 100\n",
-    "subj = 500\n",
+    "nJudges_M = 100\n",
+    "nSubjects_N = 500\n",
     "\n",
     "beta_X = 1.0\n",
     "beta_Z = 1.0\n",
     "beta_W = 0.2\n",
     "\n",
-    "judge_IDs = np.repeat(np.arange(0,M), subj)\n",
+    "judgeID_J = np.repeat(np.arange(0, nJudges_M, dtype = np.int32), nSubjects_N)\n",
     "\n",
-    "acceptance_rates = np.round(npr.uniform(.1, .9, M), 1)\n",
+    "acceptance_rates = np.round(npr.uniform(.1, .9, nJudges_M), 1)\n",
     "\n",
-    "R = np.repeat(acceptance_rates, subj)\n",
+    "acceptanceRate_R = np.repeat(acceptance_rates, nSubjects_N)\n",
     "\n",
-    "X = npr.normal(size = M * subj)\n",
-    "Z = npr.normal(size = M * subj)\n",
-    "W = npr.normal(size = M * subj)\n",
+    "X = npr.normal(size = nJudges_M * nSubjects_N)\n",
+    "Z = npr.normal(size = nJudges_M * nSubjects_N)\n",
+    "W = npr.normal(size = nJudges_M * nSubjects_N)\n",
     "\n",
     "probabilities_Y = 1 / (1 + np.exp(-(beta_X * X + beta_Z * Z + beta_W * W)))\n",
     "\n",
-    "Y = np.round(1 - probabilities_Y).astype(int)\n",
+    "result_Y = np.round(1 - probabilities_Y).astype(int)\n",
     "\n",
     "probabilities_T = 1 / (1 + np.exp(-(beta_X * X + beta_Z * Z)))\n",
-    "probabilities_T += npr.normal(.0, .1, M * subj)\n",
+    "probabilities_T += npr.normal(.0, .1, nJudges_M * nSubjects_N)\n",
     "\n",
-    "T = np.ones(M * subj)*(-1)\n",
+    "decision_T = np.zeros(nJudges_M * nSubjects_N) - 1\n",
     "\n",
-    "tmp = pd.DataFrame(np.column_stack((judge_IDs, R, X, Z, W, Y, probabilities_T, T)),\n",
-    "                   columns = [\"judge_IDs\", \"R\", \"X\", \"Z\", \"W\", \"Y\", \"probabilities_T\", \"T\"])\n",
+    "tmp = pd.DataFrame(np.column_stack((judgeID_J, acceptanceRate_R, X, Z, W, result_Y, probabilities_T, decision_T)),\n",
+    "                   columns = [\"judgeID_J\", \"acceptanceRate_R\", \"X\", \"Z\", \"W\", \"result_Y\", \"probabilities_T\", \"decision_T\"])\n",
     "\n",
     "# Sort by judges then probabilities\n",
-    "df = tmp.sort_values(by = [\"judge_IDs\", \"probabilities_T\"])\n",
+    "df = tmp.sort_values(by = [\"judgeID_J\", \"probabilities_T\"])\n",
     "\n",
     "# Iterate over the data. Subject is in the top (1-r)*100% if\n",
     "# his within-judge-index is over acceptance threshold times\n",
     "# the number of subjects assigned to each judge. If subject\n",
     "# is over the limit they are assigned a zero, else one.\n",
-    "for i in range(M * subj):\n",
-    "    index = i % subj\n",
-    "    if index >= df['R'][i] * subj:\n",
-    "        df['T'][i] = 0\n",
+    "for i in range(nJudges_M * nSubjects_N):\n",
+    "    index = i % nSubjects_N\n",
+    "    if index >= df.acceptanceRate_R[i] * nSubjects_N:\n",
+    "        df.decision_T[i] = 0\n",
     "    else:\n",
-    "        df['T'][i] = 1  # TARKISTA!!!!!!"
+    "        df.decision_T[i] = 1  # TARKISTA!!!!!!"
    ]
   },
   {
@@ -594,7 +594,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 64,
    "metadata": {
     "scrolled": true
    },
@@ -605,7 +605,7 @@
      "text": [
       "0.0    25900\n",
       "1.0    24100\n",
-      "Name: T, dtype: int64\n"
+      "Name: decision_T, dtype: int64\n"
      ]
     },
     {
@@ -628,12 +628,12 @@
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
-       "      <th>T</th>\n",
+       "      <th>decision_T</th>\n",
        "      <th>0.0</th>\n",
        "      <th>1.0</th>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>Y</th>\n",
+       "      <th>result_Y</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
@@ -654,21 +654,21 @@
        "</div>"
       ],
       "text/plain": [
-       "T      0.0    1.0\n",
-       "Y                \n",
-       "0.0  13053  12122\n",
-       "1.0  12847  11978"
+       "decision_T    0.0    1.0\n",
+       "result_Y                \n",
+       "0.0         13053  12122\n",
+       "1.0         12847  11978"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 64,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "print(df['T'].value_counts())\n",
+    "print(df.decision_T.value_counts())\n",
     "\n",
-    "tab = df.groupby(['Y', 'T']).size()\n",
+    "tab = df.groupby(['result_Y', 'decision_T']).size()\n",
     "tab.unstack()"
    ]
   },
@@ -681,7 +681,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 65,
    "metadata": {},
    "outputs": [
     {
@@ -689,7 +689,8 @@
      "output_type": "stream",
      "text": [
       "(25000, 8)\n",
-      "(25000, 8)\n"
+      "(25000, 8)\n",
+      "(12134, 8)\n"
      ]
     }
    ],
@@ -698,22 +699,18 @@
     "train, test = np.split(df.sample(frac = 1, random_state = 0), 2)\n",
     "\n",
     "print(train.shape)\n",
-    "print(test.shape)"
+    "print(test.shape)\n",
+    "\n",
+    "train_labeled = train[train.decision_T == 1]\n",
+    "\n",
+    "print(train_labeled.shape)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 66,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['judge_IDs' 'R' 'X' 'Z' 'W' 'Y' 'probabilities_T' 'T' 'B_prob_1']\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# import the class\n",
     "from sklearn.linear_model import LogisticRegression\n",
@@ -722,41 +719,117 @@
     "logreg = LogisticRegression(random_state=0, solver='lbfgs')\n",
     "\n",
     "# fit, reshape X to be of shape (n_samples, n_features)\n",
-    "logreg.fit(train.X.values.reshape(-1,1), train.Y)\n",
+    "logreg.fit(train_labeled.X.values.reshape(-1,1), train_labeled.result_Y)\n",
     "\n",
     "# predict probabilities and attach to data \n",
     "label_probabilities = logreg.predict_proba(test.X.values.reshape(-1,1))\n",
     "\n",
+    "test['B_prob_0'] = label_probabilities[:, 0]\n",
     "test['B_prob_1'] = label_probabilities[:, 1]\n",
-    "# kaks columnia, ekassa nollan tn tokassa ykkösen\n",
-    "\n"
+    "# two columns: the first holds the probability of 0, the second the probability of 1"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "def lakkaraju(x, j, t, y, s, r):\n",
-    "    return 0\n",
-    "#df = df.sort_values(by = [\"R\", \"judge_IDs\"], ascending = False) # ekana isoin acc rate ja pienin tuomari id\n",
-    "#\n",
-    "#D_q = df.iloc[0:500,] # ekat 500 kuuluu sille\n",
-    "#\n",
-    "#R_q = D_q.iloc[(D_q['T'] == 1).values] # valitaan vapaalle päässeet\n",
-    "#\n",
-    "#R_sort_q = R_q.sort_values(by = [\"probabilities_T\"], ascending = False)\n",
-    "#\n",
-    "#u = np.zeros(10)\n",
-    "## Breikkaa\n",
-    "#for judges_approval in range(10):\n",
-    "#    number_to_remove = np.round((1 - judges_approval/10) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])).astype(int)\n",
-    "#    R_B = R_sort_q.head(number_to_remove)\n",
-    "#\n",
-    "#    u[judges_approval] = np.sum(R_B['Y'] == 0)/D_q.shape[0]\n",
-    "#    \n",
-    "#plt.plot(np.arange(0,1,.1), u);plt.show()"
+    "## Implement Lakkaraju (tba)\n",
+    "\n",
+    "*Below is an implementation of the contraction algorithm presented by Lakkaraju et al. in [their paper](https://helka.finna.fi/PrimoRecord/pci.acm3098066).*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "224\n",
+      "199\n",
+      "174\n",
+      "150\n",
+      "125\n",
+      "100\n",
+      "75\n",
+      "50\n",
+      "26\n",
+      "1\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[<matplotlib.lines.Line2D at 0x2d73bf4f470>]"
+      ]
+     },
+     "execution_count": 89,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "def contraction(df, j_name, t_name, y_name, s_name, r_name, r):\n",
+    "    '''\n",
+    "    This is an implementation of the algorithm presented by Lakkaraju\n",
+    "    et al. in their paper \"The Selective Labels Problem: Evaluating \n",
+    "    Algorithmic Predictions in the Presence of Unobservables\" (2017).\n",
+    "    \n",
+    "    Parameters:\n",
+    "    df = The (Pandas) data frame containing the data, judge decisions,\n",
+    "    judge IDs, results and probability scores.\n",
+    "    j_name = String, the name of the column containing the judges' IDs\n",
+    "    in df.\n",
+    "    t_name = String, the name of the column containing the judges' decisions.\n",
+    "    y_name = String, the name of the column containing the outcomes (results).\n",
+    "    s_name = String, name of the column with probability scores from the black-box model B.\n",
+    "    r_name = String, the name of the column containing the judges' \n",
+    "    acceptance rates\n",
+    "    r = Float between 0 and 1, the given acceptance rate.\n",
+    "    \n",
+    "    Returns:\n",
+    "    u = The estimated failure rate at acceptance rate r.\n",
+    "    '''\n",
+    "    # Sort first by acceptance rate and judge ID.\n",
+    "    sorted_df = df.sort_values(by = [r_name, j_name], ascending = False)\n",
+    "\n",
+    "    most_lenient_ID = int(sorted_df[j_name].head(1)) # \"hot mess\"\n",
+    "\n",
+    "    D_q = sorted_df[sorted_df[j_name] == most_lenient_ID]\n",
+    "\n",
+    "    R_q = D_q[D_q[t_name] == 1]\n",
+    "    \n",
+    "    R_sort_q = R_q.sort_values(by = s_name, ascending = False)\n",
+    "    \n",
+    "    number_to_remove = int(np.round((1 - r) * D_q.shape[0] - (D_q.shape[0] - R_q.shape[0])))\n",
+    "    print(number_to_remove)\n",
+    "    R_B = R_sort_q.head(number_to_remove)\n",
+    "    \n",
+    "    return 1 / D_q.shape[0] * np.sum(R_B[y_name] == 0) \n",
+    "\n",
+    "failure_rates = np.zeros(10)\n",
+    "\n",
+    "for r in range(10):\n",
+    "    failure_rates[r] = contraction(test, 'judgeID_J', 'decision_T',\n",
+    "                                   'result_Y', 'B_prob_0', 'acceptanceRate_R',  r / 10)\n",
+    "    \n",
+    "plt.plot(np.arange(0.1,1,.1), failure_rates[1:])\n"
    ]
   }
  ],
-- 
GitLab