Skip to content
Snippets Groups Projects
MultinomialNBXGBoost.ipynb 419 KiB
Newer Older
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C&gt;A</td>\n",
       "      <td>ACA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C&gt;A</td>\n",
       "      <td>ACC</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 9693 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  Mutation type Trinucleotide  ALL::TARGET-10-PAIXPH-03A-01D  \\\n",
       "0           C>A           ACA                              0   \n",
       "1           C>A           ACC                              0   \n",
       "\n",
       "   ALL::TARGET-10-PAKHZT-03A-01R  ALL::TARGET-10-PAKMVD-09A-01D  \\\n",
       "0                              0                              0   \n",
       "1                              0                              0   \n",
       "\n",
       "   ALL::TARGET-10-PAKSWW-03A-01D  ALL::TARGET-10-PALETF-03A-01D  \\\n",
       "0                              1                              0   \n",
       "1                              1                              0   \n",
       "\n",
       "   ALL::TARGET-10-PALLSD-09A-01D  ALL::TARGET-10-PAMDKS-03A-01D  \\\n",
       "0                              0                              0   \n",
       "1                              0                              0   \n",
       "\n",
       "   ALL::TARGET-10-PAPJIB-04A-01D  ...  Head-SCC::V-109  Head-SCC::V-112  \\\n",
       "0                              2  ...                0                0   \n",
       "1                              0  ...                1                0   \n",
       "\n",
       "   Head-SCC::V-116  Head-SCC::V-119  Head-SCC::V-123  Head-SCC::V-124  \\\n",
       "0                0                0                0                0   \n",
       "1                0                0                0                0   \n",
       "\n",
       "   Head-SCC::V-125  Head-SCC::V-14  Head-SCC::V-29  Head-SCC::V-98  \n",
       "0                0               0               0               1  \n",
       "1                0               1               0               0  \n",
       "\n",
       "[2 rows x 9693 columns]"
      ]
     },
     "execution_count": 418,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the WES \"Other\" 96-channel mutation catalog CSV and preview its first two rows\n",
    "other_wes_mut = pd.read_csv(\"./project_data/catalogs/WES/WES_Other.96.csv\")\n",
    "other_wes_mut.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 419,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cancer Types</th>\n",
       "      <th>Sample Names</th>\n",
       "      <th>Accuracy</th>\n",
       "      <th>SBS1</th>\n",
       "      <th>SBS2</th>\n",
       "      <th>SBS3</th>\n",
       "      <th>SBS4</th>\n",
       "      <th>SBS5</th>\n",
       "      <th>SBS6</th>\n",
       "      <th>SBS7a</th>\n",
       "      <th>...</th>\n",
       "      <th>SBS51</th>\n",
       "      <th>SBS52</th>\n",
       "      <th>SBS53</th>\n",
       "      <th>SBS54</th>\n",
       "      <th>SBS55</th>\n",
       "      <th>SBS56</th>\n",
       "      <th>SBS57</th>\n",
       "      <th>SBS58</th>\n",
       "      <th>SBS59</th>\n",
       "      <th>SBS60</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ALL</td>\n",
       "      <td>TARGET-10-PAIXPH-03A-01D</td>\n",
       "      <td>0.529</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ALL</td>\n",
       "      <td>TARGET-10-PAKHZT-03A-01R</td>\n",
       "      <td>0.696</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 68 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  Cancer Types              Sample Names  Accuracy  SBS1  SBS2  SBS3  SBS4  \\\n",
       "0          ALL  TARGET-10-PAIXPH-03A-01D     0.529     0     0     0     0   \n",
       "1          ALL  TARGET-10-PAKHZT-03A-01R     0.696     0     0     0     0   \n",
       "\n",
       "   SBS5  SBS6  SBS7a  ...  SBS51  SBS52  SBS53  SBS54  SBS55  SBS56  SBS57  \\\n",
       "0     0     0      0  ...      0      0      0      1      0      0      0   \n",
       "1     0     0      0  ...      0      0      0      1      0      0      0   \n",
       "\n",
       "   SBS58  SBS59  SBS60  \n",
       "0      0      0      0  \n",
       "1      0      0      0  \n",
       "\n",
       "[2 rows x 68 columns]"
      ]
     },
     "execution_count": 419,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the WES \"Other\" signature activities CSV and preview its first two rows\n",
    "other_wes_act = pd.read_csv(\"./project_data/activities/WES/WES_Other.activities.csv\")\n",
    "other_wes_act.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports and helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 420,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "from sklearn.decomposition import PCA\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "#import torch \n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from sklearn.model_selection import cross_val_score, train_test_split, KFold\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "from sklearn.model_selection import StratifiedKFold, GridSearchCV\n",
    "from sklearn.model_selection import learning_curve\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# These ones are work in progress\n",
    "def plot_roc_auc(X_tst, y_test, model, is_multi_class=False):\n",
    "    \"\"\"Plot ROC curve(s) for a fitted classifier and annotate the figure with the AUC.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X_tst : feature matrix handed to ``model.predict_proba``.\n",
    "    y_test : true labels belonging to ``X_tst``.\n",
    "    model : fitted classifier with ``predict_proba`` (and ``classes_`` when\n",
    "        ``is_multi_class`` is True).\n",
    "    is_multi_class : False scores the second probability column as the positive\n",
    "        class; True scores the full probability matrix with one-vs-one AUC and\n",
    "        draws one one-vs-rest curve per class.\n",
    "    \"\"\"\n",
    "    probs = model.predict_proba(X_tst)\n",
    "\n",
    "    # Compute everything before opening a figure so a scoring error does not\n",
    "    # leave an empty plot behind.\n",
    "    if is_multi_class:\n",
    "        # roc_auc_score needs the whole probability matrix for multi-class\n",
    "        # targets; a single sliced column makes it raise.\n",
    "        auc = roc_auc_score(y_test, probs, multi_class='ovo')\n",
    "        # roc_curve only accepts binary targets, so binarise one class at a time.\n",
    "        # Column order of predict_proba follows model.classes_.\n",
    "        y_true = np.asarray(y_test)\n",
    "        curves = []\n",
    "        for col, cls in enumerate(model.classes_):\n",
    "            fp_rate, tp_rate, _ = roc_curve(y_true == cls, probs[:, col])\n",
    "            curves.append((str(cls), fp_rate, tp_rate))\n",
    "    else:\n",
    "        pos_probs = probs[:, 1]\n",
    "        auc = roc_auc_score(y_test, pos_probs)\n",
    "        fp_rate, tp_rate, _ = roc_curve(y_test, pos_probs)\n",
    "\n",
    "    plt.figure(figsize=(7,6))\n",
    "    plt.axis('scaled')\n",
    "    plt.xlim([0,1])\n",
    "    plt.ylim([0,1])\n",
    "    plt.title(\"AUC & ROC\")\n",
    "    if is_multi_class:\n",
    "        for cls_name, cls_fp, cls_tp in curves:\n",
    "            plt.plot(cls_fp, cls_tp, label=cls_name)\n",
    "        plt.legend(loc='center right', fontsize=8)\n",
    "    else:\n",
    "        plt.plot(fp_rate, tp_rate, 'g')\n",
    "        plt.fill_between(fp_rate, tp_rate, facecolor = \"green\", alpha = 0.7)\n",
    "    plt.text(0.95, 0.05, f'AUC = {auc}', ha='right', fontsize=12, weight='bold', color='blue')\n",
    "    plt.xlabel(\"False Positive Rate\")\n",
    "    plt.ylabel(\"True Positive Rate\")\n",
    "\n",
    "def plot_confusion_mat(y_test, y_pred, labs=None, size=None):\n",
    "    \"\"\"Draw the confusion matrix of ``y_pred`` against ``y_test`` as an annotated heatmap.\n",
    "\n",
    "    ``labs`` (optional) supplies the tick labels for both axes; ``size``\n",
    "    (optional) is the figure size and defaults to (12, 10).\n",
    "    \"\"\"\n",
    "    conf_mat = sklearn.metrics.confusion_matrix(y_test, y_pred)\n",
    "    plt.figure(figsize=(12,10) if size is None else size)\n",
    "\n",
    "    heatmap_kwargs = dict(square=False, annot=True, fmt='d', cmap='viridis', cbar=True)\n",
    "    if labs is not None:\n",
    "        # Only override seaborn's automatic tick labels when labels were given\n",
    "        heatmap_kwargs.update(xticklabels=labs, yticklabels=labs)\n",
    "    sns.heatmap(conf_mat, **heatmap_kwargs)\n",
    "\n",
    "    plt.xlabel('Predicted label')\n",
    "    plt.ylabel('True label')\n",
    "\n",
    "def plot_learning_curve(model, X, y, cv=7):\n",
    "    \"\"\"Plot mean training and validation scores against the training-set size.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model : estimator passed to ``sklearn.model_selection.learning_curve``.\n",
    "    X, y : features and labels passed to ``learning_curve``.\n",
    "    cv : cross-validation setting forwarded to ``learning_curve`` (default 7).\n",
    "    \"\"\"\n",
    "    N, train_lc, val_lc = learning_curve(model, X, y, cv=cv, train_sizes=np.linspace(0.3, 1, 25))\n",
    "    plt.figure(figsize=(7,6))\n",
    "    plt.title(\"Learning curve\")\n",
    "    # Average the per-fold scores (axis 1) for each training-set size\n",
    "    plt.plot(N, np.mean(train_lc, 1), color='blue', label='training score')\n",
    "    plt.plot(N, np.mean(val_lc, 1), color='red', label='validation score')\n",
    "    plt.xlabel(\"Training set size\")\n",
    "    plt.ylabel(\"Score\")\n",
    "    # Without this call the label= arguments above are never shown\n",
    "    plt.legend(loc='best')\n",
    "\n",
    "def plot_trn_tst_dist(y_all, y_train, y_test, y_pred, in_cols=False):\n",
    "    \"\"\"Bar-plot the label counts of the full, train, test and predicted sets.\n",
    "\n",
    "    One panel is drawn per set (2x2 grid when ``in_cols`` is True, otherwise\n",
    "    4x1) and the counts are also printed as a single table.\n",
    "    \"\"\"\n",
    "    grid_shape = (2, 2) if in_cols else (4, 1)\n",
    "    fig, ax = plt.subplots(*grid_shape)\n",
    "    fig.set_size_inches(15,8)\n",
    "\n",
    "    set_names = [\"All\", \"Train\", \"Test\", \"Pred\"]\n",
    "    label_sets = [y_all, y_train, y_test, y_pred]\n",
    "    plt_set_df = pd.DataFrame()\n",
    "    for panel, set_name, labels in zip(ax.flat, set_names, label_sets):\n",
    "        counts = pd.Series(labels).value_counts().sort_index()\n",
    "        # Column assignment aligns on the index set by the first (\"All\") column\n",
    "        plt_set_df[set_name] = counts\n",
    "\n",
    "        pd.DataFrame({set_name: counts}).plot(ax=panel, kind=\"bar\")\n",
    "        panel.tick_params(axis=\"x\", rotation=90)\n",
    "\n",
    "    fig.tight_layout()\n",
    "    display_opts = ('display.max_rows', None, 'display.max_columns', None, 'display.precision', 2)\n",
    "    with pd.option_context(*display_opts):\n",
    "        print(plt_set_df)\n",
    "\n",
    "\n",
    "   \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset preprocessing: combine profile data into a single data frame\n",
    "\n",
    "From all profile sets, a combined data frame is made, which has samples in the rows and features in the columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 421,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Profile data:\n",
      "\n",
      "---Data set diagnostics print---\n",
      "\n",
      "Missing entries in mutations: 0\n",
      "The shape of the mutations data frame (20343, 97)\n",
      "Checking normalization: sum of some rows:\n",
      " cancer::TK74_LCIS2    1.0\n",
      "RCC::TCGA             1.0\n",
      "AdenoCA::TCGA         1.0\n",
      "AdenoCA::TCGA         1.0\n",
      "Melanoma::TCGA        1.0\n",
      "Tumor counts:\n",
      " AdenoCA      7712\n",
      "SCC          2188\n",
      "cancer       1639\n",
      "HCC          1318\n",
      "Melanoma     1231\n",
      "BNHL          822\n",
      "RCC           775\n",
      "GBM           605\n",
      "Medullo       557\n",
      "CA            462\n",
      "cell          389\n",
      "CMDI          357\n",
      "LGG           326\n",
      "CLL           302\n",
      "Papillary     297\n",
      "neoplasm      247\n",
      "ALL           240\n",
      "Ewings        231\n",
      "AML           230\n",
      "30            212\n",
      "bone          203\n",
      "Name: tumor_types, dtype: int64\n",
      "\n",
      "\n",
      "Tumor types with smallish counts: 7\n",
      "Papillary    297\n",
      "neoplasm     247\n",
      "ALL          240\n",
      "Ewings       231\n",
      "AML          230\n",
      "30           212\n",
      "bone         203\n",
      "Name: tumor_types, dtype: int64\n",
      "Unique tumor types:  21\n",
      "['30', 'ALL', 'AML', 'AdenoCA', 'BNHL', 'CA', 'CLL', 'CMDI', 'Ewings', 'GBM', 'HCC', 'LGG', 'Medullo', 'Melanoma', 'Papillary', 'RCC', 'SCC', 'bone', 'cancer', 'cell', 'neoplasm']\n"
     ]
    }
   ],
   "source": [
    "\n",
    "def _shorten_sample_label(label):\n",
    "    \"\"\"Shorten the cancer-type part of a 'CancerType::SampleName' label.\n",
    "\n",
    "    Only the text before '::' is rewritten: for a hyphenated type the second\n",
    "    hyphen-separated piece is kept (e.g. 'Head-SCC' -> 'SCC') and 'Ca' is\n",
    "    replaced by 'CA'. The sample name after '::' is returned unchanged.\n",
    "    \"\"\"\n",
    "    cancer_type, sep, sample_name = str(label).partition('::')\n",
    "    parts = cancer_type.split('-')\n",
    "    short_type = parts[1] if len(parts) > 1 else parts[0]\n",
    "    return short_type.replace('Ca', 'CA') + sep + sample_name\n",
    "\n",
    "\n",
    "def prepare_mut_df(raw_mutation_dfs, is_profile, small_sample_limit=None):\n",
    "    \"\"\"Combine raw data frames into one row-normalised samples-by-features frame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    raw_mutation_dfs : iterable of DataFrame\n",
    "        Profile frames need 'Mutation type' and 'Trinucleotide' columns plus one\n",
    "        column per sample; activity frames need 'Cancer Types', 'Sample Names'\n",
    "        and 'Accuracy' columns plus one column per feature.\n",
    "    is_profile : bool\n",
    "        True for profile frames (transposed so that samples become rows),\n",
    "        False for activity frames (samples are already rows).\n",
    "    small_sample_limit : int, optional\n",
    "        When given, tumor types with at most this many samples are dropped\n",
    "        by cull_small_sample_counts.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (DataFrame, list)\n",
    "        The combined frame, sorted by index and with an added 'tumor_types'\n",
    "        column, and the sorted list of distinct tumor types left in it.\n",
    "    \"\"\"\n",
    "    prepared = []\n",
    "\n",
    "    for df in raw_mutation_dfs:\n",
    "        # assign() returns a new frame, so the caller's data is never modified\n",
    "        if is_profile:\n",
    "            row_key = df['Mutation type'].astype(str) + '_' + df['Trinucleotide'].astype(str)\n",
    "            mutations = (\n",
    "                df.assign(mut_tri=row_key)\n",
    "                .set_index('mut_tri')\n",
    "                .drop(['Mutation type', 'Trinucleotide'], axis=1)\n",
    "                .T\n",
    "            )\n",
    "        else:\n",
    "            row_key = df['Cancer Types'].astype(str) + '::' + df['Sample Names'].astype(str)\n",
    "            mutations = (\n",
    "                df.assign(mut_tri=row_key)\n",
    "                .set_index('mut_tri')\n",
    "                .drop(['Cancer Types', 'Sample Names', 'Accuracy'], axis=1)\n",
    "            )\n",
    "\n",
    "        # Shorten only the cancer-type prefix of each label. Splitting the whole\n",
    "        # label on '-' let hyphens inside sample names replace the tumor type\n",
    "        # (e.g. 'ALL::TARGET-10-...' turned into '10').\n",
    "        mutations = mutations.rename(index=_shorten_sample_label)\n",
    "\n",
    "        # Normalize every sample (row) to sum to one.\n",
    "        # NOTE(review): a row whose values sum to zero becomes NaN here.\n",
    "        mutations = mutations.divide(mutations.sum(axis=1), axis=0)\n",
    "\n",
    "        prepared.append(mutations)\n",
    "\n",
    "    # Concatenate once; growing a frame inside the loop re-copies it every pass\n",
    "    mutations_all = pd.concat(prepared) if prepared else pd.DataFrame()\n",
    "    mutations_all = mutations_all.sort_index()\n",
    "\n",
    "    # The tumor type is the part of the index label before the first ':'\n",
    "    mutations_all[\"tumor_types\"] = [a.split(':')[0] for a in mutations_all.index]\n",
    "\n",
    "    # Get rid of types with very few samples if the limit is specified\n",
    "    if small_sample_limit is not None:\n",
    "        mutations_all = cull_small_sample_counts(mutations_all, small_sample_limit)\n",
    "\n",
    "    # Sorted list in which every remaining tumor type appears once\n",
    "    unique_tumor_types = sorted(set(mutations_all[\"tumor_types\"]))\n",
    "\n",
    "    return (mutations_all, unique_tumor_types)\n",
    "\n",
    "def cull_small_sample_counts(mutations, small_sample_limit):\n",
    "    \"\"\"Return only the rows whose tumor type occurs more than ``small_sample_limit`` times.\n",
    "\n",
    "    The tumor type is read from the 'tumor_types' column of ``mutations``.\n",
    "    \"\"\"\n",
    "    type_counts = mutations[\"tumor_types\"].value_counts()\n",
    "    kept_types = [t for t, n in type_counts.items() if n > small_sample_limit]\n",
    "    return mutations[mutations[\"tumor_types\"].isin(kept_types)]\n",
    "\n",
    "def print_dset_diag(mut_df, unique_tumor_types, small_sample_limit):\n",
    "    \"\"\"Print sanity checks for a prepared data set.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    mut_df : DataFrame with feature columns and a 'tumor_types' label column.\n",
    "    unique_tumor_types : list of tumor types; its length and content are printed.\n",
    "    small_sample_limit : tumor types with fewer than 1.5 times this many samples\n",
    "        are reported as \"smallish\"; pass None to skip that report.\n",
    "    \"\"\"\n",
    "    # Check if the data frame is ok\n",
    "    print(\"\\n---Data set diagnostics print---\\n\")\n",
    "    print(\"Missing entries in mutations:\", mut_df.isnull().sum().sum())\n",
    "    print(\"The shape of the mutations data frame\", mut_df.shape)\n",
    "\n",
    "    # Check that rows are normalized to one on a small random sample.\n",
    "    # sample() raises when asked for more rows than exist, so cap the size.\n",
    "    norm_df = mut_df.sample(n=min(5, len(mut_df)), random_state=5)\n",
    "    # Drop the label column by name instead of assuming it is the last column\n",
    "    print(\"Checking normalization: sum of some rows:\\n\", norm_df.drop(columns=\"tumor_types\").sum(axis=1))\n",
    "    print(\"\\n\")\n",
    "\n",
    "    # Check some counts of tumor types\n",
    "    tumor_counts = mut_df[\"tumor_types\"].value_counts()\n",
    "    print(\"Tumor counts:\\n\", tumor_counts)\n",
    "    print(\"\\n\")\n",
    "\n",
    "    # prepare_mut_df accepts a limit of None, so tolerate it here as well\n",
    "    if small_sample_limit is not None:\n",
    "        small_counts = tumor_counts < 1.5*small_sample_limit\n",
    "        print(\"Tumor types with smallish counts:\",  sum(small_counts))\n",
    "\n",
    "        print(tumor_counts[small_counts])\n",
    "        print(\"\\n\")\n",
    "\n",
    "    # Tumor types\n",
    "    print(\"Unique tumor types: \", len(unique_tumor_types))\n",
    "    print(unique_tumor_types)\n",
    "\n",
    "\n",
    "# Tumor types with this many samples or fewer are dropped by prepare_mut_df\n",
    "# (cull_small_sample_counts keeps only counts strictly above the limit).\n",
    "# NOTE(review): the saved output of this cell still lists types with 203-247\n",
    "# samples, which a limit of 250 would remove - the output looks stale; re-run to confirm.\n",
    "small_sample_limit = 250\n",
    "\n",
    "# other_wes_mut is loaded above; the other three frames are defined outside this section\n",
    "profile_raw_data_sets = [PCAWG_wgs_mut, TCGA_wes_mut, nonPCAWG_wgs_mut, other_wes_mut]\n",
    "profile_mut_all, prf_unique_tumor_types = prepare_mut_df(profile_raw_data_sets, True, small_sample_limit)\n",
    "\n",
    "# Print some diagnostics from the prepared data set\n",
    "print(\"Profile data:\")\n",
    "print_dset_diag(profile_mut_all, prf_unique_tumor_types, small_sample_limit)\n",
    "\n",
    "# Data matrix X for fitting, omit the tumor labeling from there, use that information in constructing true y\n",
    "# Note: this contains profile data only\n",
    "#X_prf = profile_mut_all.drop(\"tumor_types\", axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset preprocessing for activities data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 422,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Activities data:\n",
      "\n",
      "---Data set diagnostics print---\n",
      "\n",
      "Missing entries in mutations: 0\n",
      "The shape of the mutations data frame (20343, 66)\n",
      "Checking normalization: sum of some rows:\n",
      " mut_tri\n",
      "cancer::TK74_LCIS2    1.0\n",
      "RCC::TCGA             1.0\n",
      "AdenoCA::TCGA         1.0\n",
      "AdenoCA::TCGA         1.0\n",
      "Melanoma::TCGA        1.0\n",
      "Tumor counts:\n",
      " AdenoCA      7712\n",
      "SCC          2188\n",
      "cancer       1639\n",
      "HCC          1318\n",
      "Melanoma     1231\n",
      "BNHL          822\n",
      "RCC           775\n",
      "GBM           605\n",
      "Medullo       557\n",
      "CA            462\n",
      "cell          389\n",
      "CMDI          357\n",
      "LGG           326\n",
      "CLL           302\n",
      "Papillary     297\n",
      "neoplasm      247\n",
      "ALL           240\n",
      "Ewings        231\n",
      "AML           230\n",
      "30            212\n",
      "bone          203\n",
      "Name: tumor_types, dtype: int64\n",
      "\n",
      "\n",
      "Tumor types with smallish counts: 7\n",
      "Papillary    297\n",
      "neoplasm     247\n",
      "ALL          240\n",
      "Ewings       231\n",
      "AML          230\n",
      "30           212\n",
      "bone         203\n",
      "Name: tumor_types, dtype: int64\n",
      "Unique tumor types:  21\n",
      "['30', 'ALL', 'AML', 'AdenoCA', 'BNHL', 'CA', 'CLL', 'CMDI', 'Ewings', 'GBM', 'HCC', 'LGG', 'Medullo', 'Melanoma', 'Papillary', 'RCC', 'SCC', 'bone', 'cancer', 'cell', 'neoplasm']\n"
     ]
    }
   ],
   "source": [
    "# other_wes_act is loaded above; the other three frames are defined outside this section.\n",
    "# small_sample_limit is reused from the profile preprocessing cell.\n",
    "act_raw_data_sets = [PCAWG_wgs_act, TCGA_wes_act, nonPCAWG_wgs_act, other_wes_act]\n",
    "act_mut_all, act_unique_tumor_types = prepare_mut_df(act_raw_data_sets, is_profile=False, small_sample_limit=small_sample_limit)\n",
    "\n",
    "# Print some diagnostics from the prepared data set\n",
    "print(\"Activities data:\")\n",
    "print_dset_diag(act_mut_all, act_unique_tumor_types, small_sample_limit)\n",
    "\n",
    "# Data matrix X for fitting, omit the tumor labeling from there, use that information in constructing true y\n",
    "# Note: this contains activities data only\n",
    "X_act = act_mut_all.drop(\"tumor_types\", axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check profile data content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 423,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Some content from the full profile set:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>mut_tri</th>\n",
       "      <th>C&gt;A_ACA</th>\n",
       "      <th>C&gt;A_ACC</th>\n",
       "      <th>C&gt;A_ACG</th>\n",
       "      <th>C&gt;A_ACT</th>\n",
       "      <th>C&gt;A_CCA</th>\n",
       "      <th>C&gt;A_CCC</th>\n",
       "      <th>C&gt;A_CCG</th>\n",
       "      <th>C&gt;A_CCT</th>\n",
       "      <th>C&gt;A_GCA</th>\n",
       "      <th>C&gt;A_GCC</th>\n",
       "      <th>...</th>\n",
       "      <th>T&gt;G_CTT</th>\n",
       "      <th>T&gt;G_GTA</th>\n",
       "      <th>T&gt;G_GTC</th>\n",
       "      <th>T&gt;G_GTG</th>\n",
       "      <th>T&gt;G_GTT</th>\n",
       "      <th>T&gt;G_TTA</th>\n",
       "      <th>T&gt;G_TTC</th>\n",
       "      <th>T&gt;G_TTG</th>\n",
       "      <th>T&gt;G_TTT</th>\n",
       "      <th>tumor_types</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>30</td>\n",
       "      <th>30</th>\n",
       "      <td>0.040000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.08</td>\n",
       "      <td>0.020000</td>\n",
       "      <td>0.020000</td>\n",
       "      <td>0.040000</td>\n",
       "      <td>0.140000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.04</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30</td>\n",
       "      <th>30</th>\n",
       "      <td>0.153846</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.076923</td>\n",
       "      <td>0.076923</td>\n",
       "      <td>0.076923</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>0.032258</td>\n",
       "      <td>0.032258</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.032258</td>\n",
       "      <td>0.032258</td>\n",
       "      <td>0.032258</td>\n",
       "      <td>0.032258</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30</td>\n",
       "      <th>30</th>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 97 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "mut_tri   C>A_ACA   C>A_ACC   C>A_ACG  C>A_ACT  C>A_CCA   C>A_CCC   C>A_CCG  \\\n",
       "30       0.000000  0.000000  0.000000     0.00     0.00  0.000000  0.000000   \n",
       "30       0.040000  0.000000  0.000000     0.02     0.08  0.020000  0.020000   \n",
       "30       0.153846  0.000000  0.000000     0.00     0.00  0.076923  0.000000   \n",
       "30       0.000000  0.032258  0.032258     0.00     0.00  0.000000  0.032258   \n",
       "30       0.100000  0.100000  0.000000     0.00     0.00  0.000000  0.000000   \n",
       "\n",
       "mut_tri   C>A_CCT   C>A_GCA   C>A_GCC  ...  T>G_CTT  T>G_GTA  T>G_GTC  \\\n",
       "30       0.100000  0.100000  0.000000  ...     0.00      0.0      0.0   \n",
       "30       0.040000  0.140000  0.000000  ...     0.04      0.0      0.0   \n",
       "30       0.076923  0.076923  0.000000  ...     0.00      0.0      0.0   \n",
       "30       0.032258  0.032258  0.032258  ...     0.00      0.0      0.0   \n",
       "30       0.000000  0.000000  0.000000  ...     0.00      0.0      0.0   \n",
       "\n",
       "mut_tri  T>G_GTG  T>G_GTT  T>G_TTA  T>G_TTC  T>G_TTG  T>G_TTT  tumor_types  \n",
       "30           0.1      0.0      0.0      0.0     0.00      0.0           30  \n",
       "30           0.0      0.0      0.0      0.0     0.02      0.0           30  \n",
       "30           0.0      0.0      0.0      0.0     0.00      0.0           30  \n",
       "30           0.0      0.0      0.0      0.0     0.00      0.0           30  \n",
       "30           0.0      0.0      0.0      0.0     0.00      0.0           30  \n",
     "execution_count": 423,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the combined profile frame (feature columns plus tumor_types)\n",
    "print(\"Some content from the full profile set:\")\n",
    "profile_mut_all.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 424,
      "image/png": "iVBORw0KGgoAAAANSUhEUgAABaYAAAFhCAYAAACLRX2NAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAAsTAAALEwEAmpwYAABNnElEQVR4nO3de1iUdf7/8dcA4ikOgaiUlmWmWF/TctMs3VYry1BJt6VAW9vS/ZWla1qipZBmhYdKS7OTtpZbGyGo1K61spntmnZYyxZJRUzdEBU0UZHDML8/5mJG8kSA92ec+/m4rr2+MDfEe15fHGZec9+fj8PlcrkEAAAAAAAAAIBFAkwPAAAAAAAAAACwF4ppAAAAAAAAAIClKKYBAAAAAAAAAJaimAYAAAAAAAAAWIpiGgAAAAAAAABgKYppAAAAAAAAAIClalVM//Of/1RcXJwGDx6sgQMH6qOPPpIk5efnKz4+Xv3791d8fLx27Njh+Z66HgMAAAAAAAAA+DeHy+Vyne4LXC6Xrr32Wi1dulSXX365cnNzdffdd+urr77SiBEjNHToUA0ePFjLly9Xenq6lixZIkm655576nSstg4cOKKqqtOOflZFRp6noqLDxn6+ryAHL7JwIwcvsnAjBzdy8CILN3LwIgs3cvAiCzdycCMHL7JwIwcvsnAjBy+ycCMHN1/IISDAofPPb37SY0G1+w8EqKSkRJJUUlKili1b6sCBA8rJydHixYslSbGxsZo+fbqKi4vlcrnqdCwiIqLWd6qqymW0mK6eAeRwPLJwIwcvsnAjBzdy8CILN3LwIgs3cvAiCzdycCMHL7JwIwcvsnAjBy+ycCMHN1/O4YzFtMPh0AsvvKAHH3xQzZo105EjR/TKK6+ooKBArVq1UmBgoCQpMDBQLVu2VEFBgVwuV52O/ZJiOjLyvLrc3wYVFRViegSfQA5eZOFGDl5k4UYObuTgRRZu5OBFFm7k4EUWbuTgRg5eZOFGDl5k4UYOXmThRg5uvpzDGYvpyspKvfLKK1qwYIGuueYaffXVVxo3bpxmzpxpxXynVFR02GjjHxUVon37Soz9fF9BDl5k4UYOXmThRg5u5OBFFm7k4EUWbuTgRRZu5OBGDl5k4UYOXmThRg5eZOFGDm6+kENAgOOUJxifsZjevHmz9u7dq2uuuUaSdM0116hp06Zq3LixCgsL5XQ6FRgYKKfTqb179yo6Oloul6tOxwAAAAAAAAAA/i/gTF/QunVr7dmzR9u3b5ck5eXlaf/+/br44osVExOjrKwsSVJWVpZiYmIUERGhyMjIOh0DAAAAAAAAAPi/M54xHRUVpZSUFI0dO1YOh0OS9Mwzzyg8PFwpKSlKSkrSggULFBoaqtTUVM/31fUYAAAAAAAAAMC/nbGYlqRBgwZp0KBBJ9zevn17paWlnfR76noMAAAAAAAAAODfzriUBwAAAAAAAAAADYliGgAAAAAAAABgKYppAAAAAAAAAIClKKYBAAAAAAAAAJaq1eaHAOwjJLSpmjSu30NDVFRIvb7/WFmlSg6V1uu/AQAAAAAAAN9FMQ2ghiaNgzRw/HKjM6ycM1glRicAAAAAAADA2cRSHgAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALBU0Jm+YPfu3Ro9erTn85KSEh0+fFgbNmxQfn6+kpKSdPDgQYWHhys1NVXt2rWTpDofAwAAAAAAAAD4tzOeMd2mTRstX77c
879+/fopNjZWkpScnKyEhAStWrVKCQkJmjp1quf76noMAAAAAAAAAODfftFSHuXl5Vq5cqWGDh2qoqIi5eTkeErq2NhY5eTkqLi4uM7HAAAAAAAAAAD+74xLeRwvOztbrVq10hVXXKHvvvtOrVq1UmBgoCQpMDBQLVu2VEFBgVwuV52ORURE1HqWyMjzfsnoZ0VUVIjpEXwCOXiRRcPxlyz95X7UFzm4kYMXWbiRgxdZuJGDF1m4kYMbOXiRhRs5eJGFGzl4kYUbObj5cg6/qJhOT0/X0KFDz9Ysv0hR0WFVVbmM/fyoqBDt21di7Of7CnLw8pcsfOUBy1+y9If7UV/k4EYOXmThRg5eZOFGDl5k4UYObuTgRRZu5OBFFm7k4EUWbuTg5gs5BAQ4TnmCca2X8igsLNQXX3yhgQMHSpKio6NVWFgop9MpSXI6ndq7d6+io6PrfAwAAAAAAAAA4P9qXUxnZGTo17/+tc4//3xJUmRkpGJiYpSVlSVJysrKUkxMjCIiIup8DAAAAAAAAADg/2q9lEdGRoYef/zxGrelpKQoKSlJCxYsUGhoqFJTU+t9DAAAAAAAAADg32pdTK9ateqE29q3b6+0tLSTfn1djwEAAAAAAAAA/Futl/IAAAAAAAAAAKAhUEwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUrUqpsvKypScnKxbbrlFAwcO1JQpUyRJ+fn5io+PV//+/RUfH68dO3Z4vqeuxwAAAAAAAAAA/q1WxfSsWbPUuHFjrVq1SitXrtTYsWMlScnJyUpISNCqVauUkJCgqVOner6nrscAAAAAAAAAAP7tjMX0kSNHlJmZqbFjx8rhcEiSWrRooaKiIuXk5Cg2NlaSFBsbq5ycHBUXF9f5GAAAAAAAAADA/wWd6Qt27dql8PBwvfTSS1q/fr2aN2+usWPHqkmTJmrVqpUCAwMlSYGBgWrZsqUKCgrkcrnqdCwiIqLWg0dGnleX+9ugoqJCTI/gE8jBiywajr9k6S/3o77IwY0cvMjCjRy8yMKNHLzIwo0c3MjBiyzcyMGLLNzIwYss3MjBzZdzOGMxXVlZqV27dqlz586aOHGivvnmG/2///f/NHfuXCvmO6WiosOqqnIZ+/lRUSHat6/E2M/3FeTg5S9Z+MoDlr9k6Q/3o77IwY0cvMjCjRy8yMKNHLzIwo0c3MjBiyzcyMGLLNzIwYss3MjBzRdyCAhwnPIE4zMW0xdccIGCgoI8S29cddVVOv/889WkSRMVFhbK6XQqMDBQTqdTe/fuVXR0tFwuV52OAQAAAAAAAAD83xnXmI6IiFCPHj30r3/9S5KUn5+voqIitWvXTjExMcrKypIkZWVlKSYmRhEREYqMjKzTMQAAAAAAAACA/zvjGdOS9OSTT2ry5MlKTU1VUFCQZs6cqdDQUKWkpCgpKUkLFixQaGioUlNTPd9T12MAAAAAAAAAAP9Wq2K6bdu2euutt064vX379kpLSzvp99T1GAAAAAAAAADAv51xKQ8AAAAAAAAAABoSxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FM
AwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALAUxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAsRTENAAAAAAAAALBUUG2+qG/fvgoODlbjxo0lSRMmTFDv3r2Vn5+vpKQkHTx4UOHh4UpNTVW7du0kqc7HAAAAAAAAAAD+rdZnTM+bN0/Lly/X8uXL1bt3b0lScnKyEhIStGrVKiUkJGjq1Kmer6/rMQAAAAAAAACAf6vzUh5FRUXKyclRbGysJCk2NlY5OTkqLi6u8zEAAAAAAAAAgP+r1VIeknv5DpfLpWuuuUaPPPKICgoK1KpVKwUGBkqSAgMD1bJlSxUUFMjlctXpWERERK0Hj4w875fcz7MiKirE9Ag+gRy8yKLh+EuW/nI/6osc3MjBiyzcyMGLLNzIwYss3MjBjRy8yMKNHLzIwo0cvMjCjRzcfDmHWhXTS5cuVXR0tMrLyzVjxgxNmzZNI0aMOMujnV5R0WFVVbmM/fyoqBDt21di7Of7CnLw8pcsfOUBy1+y9If7UV/k4EYOXmThRg5eZOFGDl5k4UYObuTgRRZu5OBFFm7k4EUWbuTg5gs5BAQ4TnmCca2W8oiOjpYkBQcHKyEhQV9//bWio6NVWFgop9MpSXI6ndq7d6+io6PrfAwAAAAAAAAA4P/OWEwfPXpUJSXuZt3lcunDDz9UTEyMIiMjFRMTo6ysLElSVlaWYmJiFBERUedjAAAAAAAAAAD/d8alPIqKivTwww/L6XSqqqpK7du3V3JysiQpJSVFSUlJWrBggUJDQ5Wamur5vroeAwAAAAAAAAD4tzMW023btlVmZuZJj7Vv315paWkNegwAAAAAAAAA4N9qtcY0AAAAAAAAAAANhWIaAAAAAAAAAGApimkAAAAAAAAAgKUopgEAAAAAAAAAlqKYBgAAAAAAAABYimIaAAAAAAAAAGApimkAAAAAAAAAgKUopgEAAAAAAAAAlqKYBgAAAAAAAABYimIaAAAAAAAAAGApimkAAAAAAAAAgKUopgEAAAAAAAAAlqKYBgAAAAAAAABYimIaAAAAAAAAAGApimkAAAAAAAAAgKUopgEAAAAAAAAAlqKYBgAAAAAAAABYimIaAAAAAAAAAGApimkAAAAAAAAAgKUopgEAAAAAAAAAlqKYBgAAAAAAAABYimIaAAAAAAAAAGApimkAAAAAAAAAgKUopgEAAAAAAAAAlvpFxfRLL72kjh07asuWLZKk/Px8xcfHq3///oqPj9eOHTs8X1vXYwAAAAAAAAAA/1brYvq///2vNm7cqAsuuMBzW3JyshISErRq1SolJCRo6tSp9T4GAAAAAAAAAPBvtSqmy8vLNW3aNCUnJ8vhcEiSioqKlJOTo9jYWElSbGyscnJyVFxcXOdjAAAAAAAAAAD/F1SbL5o7d64GDRqktm3bem4rKChQq1atFBgYKEkKDAxUy5YtVVBQIJfLVadjERERDX3/AAAAAAAAAAA+5ozF9H/+8x9t2rRJEyZMsGKeWouMPM/0CIqKCjE9gk8gBy+yaDj+kqW/3I/6Igc3cvAiCzdy8CILN3LwIgs3cnAjBy+ycCMHL7JwIwcvsnAjBzdfzuGMxfQXX3yh7du3q1+/fpKkPXv26L777tOkSZNUWFgop9OpwMBAOZ1O7d27V9HR0XK5XHU69ksUFR1WVZWrbve6AURFhWjfvhJjP99XkIOXv2ThKw9Y/pKlP9yP+iIHN3LwIgs3cvAiCzdy8CILN3JwIwcvsnAjBy+ycCMHL7JwIwc3X8ghIMBxyhOMz7jG9KhRo/TZZ58pOztb2dnZat26td544w0NGDBAMTExysrKkiRlZWUpJiZGERERioyMrNMxAAAAAAAAAID/q9Ua
06eSkpKipKQkLViwQKGhoUpNTa33MQAAAAAAAACAf/vFxXR2drbn4/bt2ystLe2kX1fXYwAAAAAAAAAA/3bGpTwAAAAAAAAAAGhIFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwFMU0AAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwFMU0AAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwFMU0AAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwFMU0AAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwFMU0AAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwFMU0AAAAAAAAAMBSQbX5ogcffFC7d+9WQECAmjVrpilTpigmJkb5+flKSkrSwYMHFR4ertTUVLVr106S6nwMAAAAAAAAAODfanXGdGpqqlasWKHMzEz94Q9/0OTJkyVJycnJSkhI0KpVq5SQkKCpU6d6vqeuxwAAAAAAAAAA/q1WxXRISIjn48OHD8vhcKioqEg5OTmKjY2VJMXGxionJ0fFxcV1PgYAAAAAAAAA8H+1WspDkh5//HH961//ksvl0uuvv66CggK1atVKgYGBkqTAwEC1bNlSBQUFcrlcdToWERFR68EjI8/7JffzrIiKCjnzF9kAOXiRRcPxlyz95X7UFzm4kYMXWbiRgxdZuJGDF1m4kYMbOXiRhRs5eJGFGzl4kYUbObj5cg61LqZnzJghScrMzNTMmTM1duzYszZUbRQVHVZVlcvYz4+KCtG+fSXGfr6vIAcvf8nCVx6w/CVLf7gf9UUObuTgRRZu5OBFFm7k4EUWbuTgRg5eZOFGDl5k4UYOXmThRg5uvpBDQIDjlCcY12opj+PFxcVp/fr1at26tQoLC+V0OiVJTqdTe/fuVXR0tKKjo+t0DAAAAAAAAADg/85YTB85ckQFBQWez7OzsxUWFqbIyEjFxMQoKytLkpSVlaWYmBhFRETU+RgAAAAAAAAAwP+dcSmP0tJSjR07VqWlpQoICFBYWJgWLlwoh8OhlJQUJSUlacGCBQoNDVVqaqrn++p6DAAAAAAAAADg385YTLdo0ULvvffeSY+1b99eaWlpDXoMAAAAAAAAAODffvEa0wAAAAAAAAAA1AfFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsBTFNAAAAAAAAADAUhTTAAAAAAAAAABLUUwDAAAAAAAAACxFMQ0AAAAAAAAAsFSQ6QEAAAAAAADsJiS0qZo0rn8tExUVUufvPVZWqZJDpfWeAQDqgmIaAAAAAADAYk0aB2ng+OVGZ1g5Z7BKjE4AwM5YygMAAAAAAAAAYCmKaQAAAAAAAACApc5YTB84cEAjR45U//79NXDgQD300EMqLi6WJOXn5ys+Pl79+/dXfHy8duzY4fm+uh4DAAAAAAAAAPi3MxbTDodD999/v1atWqWVK1eqbdu2mj17tiQpOTlZCQkJWrVqlRISEjR16lTP99X1GAAAAAAAAADAv52xmA4PD1ePHj08n3ft2lU//vijioqKlJOTo9jYWElSbGyscnJyVFxcXOdjAAAAAAAAAAD/F/RLvriqqkrvvPOO+vbtq4KCArVq1UqBgYGSpMDAQLVs2VIFBQVyuVx1OhYREVHrWSIjz/slo58VUVEhpkfwCeTgRRYNx1+y
9Jf7UV/k4EYOXmThRg5eZOFGDl5k4UYObuTgRRZu5NBw/CVLf7kfDYEs3MjBzZdz+EXF9PTp09WsWTMNGzZMOTk5Z2umWikqOqyqKpexnx8VFaJ9+0qM/XxfQQ5e/pKFrzxg+UuW/nA/6osc3MjBiyzcyMGLLNzIwYss3MjBjRy8yMLNX3LgtVfD8ZffiYZAFm7k4OYLOQQEOE55gnGti+nU1FT98MMPWrhwoQICAhQdHa3CwkI5nU4FBgbK6XRq7969io6OlsvlqtMxAAAAAAAAAID/O+Ma05L0/PPP67vvvtP8+fMVHBwsSYqMjFRMTIyysrIkSVlZWYqJiVFERESdjwEAAAAAAAAA/N8Zz5jeunWrFi5cqHbt2umuu+6SJLVp00bz589XSkqKkpKStGDBAoWGhio1NdXzfXU9BgAAAAAAAADwb2cspjt06KDvv//+pMfat2+vtLS0Bj0GAAAAAAAAAPBvtVrKAwAAAAAAAACAhkIxDQAAAAAAAACwFMU0AAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwVJDpAQAAAAAAAAC7CwltqiaN61/VRUWF1Pl7j5VVquRQab1nAGqDYhoAAAAAAAAwrEnjIA0cv9zoDCvnDFaJ0QlgJyzlAQAAAAAAAACwFMU0AAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwFMU0AAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALEUxDQAAAAAAAACwVJDpAQAAAAB/EBLaVE0a1+/pdVRUSL2+/1hZpUoOldbrvwEAAABYgWIaAAAAaABNGgdp4PjlRmdYOWewSoxOAAAAANQOxTQAAAAA4KzhagIAAHAyFNMAAAAAgLOGqwkAAMDJsPkhAAAAAAAAAMBSFNMAAAAAAAAAAEtRTAMAAAAAAAAALHXGYjo1NVV9+/ZVx44dtWXLFs/t+fn5io+PV//+/RUfH68dO3bU+xgAAAAAAAAAwP+dsZju16+fli5dqgsvvLDG7cnJyUpISNCqVauUkJCgqVOn1vsYAAAAAAAAAMD/nbGY7t69u6Kjo2vcVlRUpJycHMXGxkqSYmNjlZOTo+Li4jofAwAAAAAAAADYQ1BdvqmgoECtWrVSYGCgJCkwMFAtW7ZUQUGBXC5XnY5FRET8ohkiI8+ry+gNKioqxPQIPoEcvMii4fhLlv5yP+qLHNzIwYss3MjBiywajr9k6S/3o77IoeH4S5b+cj/qixwajr9k6S/3wxf4S5b+cj/qy5dzqFMx7QuKig6rqspl7OdHRYVo374SYz/fV5CDl79k4SsPWP6SpT/cj/oiBzdy8CILN3Lw8pcs+BvacPzld6K+/CUH/m00HH/5nagvf8mBfxsNh9+JhuUvWfrD/agvX8ghIMBxyhOM61RMR0dHq7CwUE6nU4GBgXI6ndq7d6+io6PlcrnqdAwAAAAAAAAAYA9nXGP6ZCIjIxUTE6OsrCxJUlZWlmJiYhQREVHnYwAAAAAAAAAAezjjGdNPPfWUPvroI+3fv1/33nuvwsPD9cEHHyglJUVJSUlasGCBQkNDlZqa6vmeuh4DAAAAAAAAAPi/MxbTTzzxhJ544okTbm/fvr3S0tJO+j11PQYAAAAAAAAA8H/n7OaHAHA2hYQ2VZPG9X+IrM/mFcfKKlVyqLTeMwAAAAAAAPgaimkAOIkmjYM0cPxyozOsnDNY7CEMAAAAAAD8EcU0AAAAAACwTENcnVifKxMlrk4EAF9AMQ0AAAAAACzD1YkAAEkKMD0AAAAAAAAAAMBeKKYBAAAAAAAAAJaimAYAAAAAAAAAWIo1pgEAAFBnDbGBlVS/TazYwAoAAAA491BMAwAAoM7YwAoAAABAXVBMAwAAAEAD84WrCSSuKAAAAL6LYhoAAAAAGpgvXE0gcUUBAADwXWx+CAAAAAAAAACwFGdMAwBOicuQAQAA
AADA2UAxDQA4JS5DBgAAAAAAZwNLeQAAAAAAAAAALMUZ0wAAAAAAAAB8AktK2gfFNAAAqDWeJAIAAAA4m1hS0j4opgEAQK3xJBHAmfjCG1i8eQUAAOD7KKYBAKiFhihaOEsYgB34whtYvHkFX+QLb9pIPJ8AAPgOimkAAGqBogUAANSHLzyXkHg+AQDwHRTTNscZgG6+cPaCL+QAAKg9/oYCAAAAOJv8/TWHLYtpXyghJd94MekL79r7wjv25AAA+KX42wEAANAw/L18A+rK319z2LKY9oX/p0q8mAQAAAAAAPCFnoaOBrBegOkBAAAAAAAAAAD2QjENAAAAAAAAALCUsWI6Pz9f8fHx6t+/v+Lj47Vjxw5TowAAAAAAAAAALGSsmE5OTlZCQoJWrVqlhIQETZ061dQoAAAAAAAAAAALGdn8sKioSDk5OVq8eLEkKTY2VtOnT1dxcbEiIiJq9d8ICHDUa4aW5zet1/c3lPrej4bgC1mQg5sv5CCRRTVycPOFHCSyqEYOXmThRg5uvpCDRBbVyMHNF3KQyKIaOXiRhRs5uPlCDhJZVCMHN1/IQSKLavXJ4XTf63C5XK46/5fr6LvvvtPEiRP1wQcfeG4bMGCAZs2apSuuuMLqcQAAAAAAAAAAFmLzQwAAAAAAAACApYwU09HR0SosLJTT6ZQkOZ1O7d27V9HR0SbGAQAAAAAAAABYyEgxHRkZqZiYGGVlZUmSsrKyFBMTU+v1pQEAAAAAAAAA5y4ja0xLUl5enpKSknTo0CGFhoYqNTVVl156qYlRAAAAAAAAAAAWMlZMAwAAAAAAAADsic0PAQAAAAAAAACWopgGAAAAAAAAAFiKYhoAAAAAAAAAYCmKaQAAAAAAAACApSimAQAAAAAAAACWopgGGlBJSYnpEQAAAAAAAACfRzH9Cxw4cECbN2/W5s2bdeDAAdPj+IxJkyaZHsFnDBw40PQIRpWXl2vFihUaMWKE6VF8ypYtW0yPYFRhYaFefvll3XLLLaZH8Ql/+MMfTI8AAACAc9Dq1au1fPnyE25PS0tTdna2gYlgmtPp1F//+lfTYxi3bdu20/7PrsrLy1VaWur5ny8KMj3AuWDnzp2aMmWKcnJy1LJlS0nS3r171blzZz355JNq166d2QENW7dunekRfIbL5TI9ghGbNm3S+++/r7///e/6v//7P8XFxZkeyaeMGjVKn3zyiekxLFVRUaHVq1fr/fff14YNG3THHXfo6aefNj2WT9i+fbvpESw1dOhQORyOUx5///33LZzGtxQVFWnXrl3q2rWr6VEss3Tp0tMeT0xMtGgS85xOp8rLy9W0adMat5eWlio4OFiBgYGGJrOey+VSeXm5GjduLEnauHGjKioqJEmdO3dW8+bNTY5nxNq1a/Xvf/9bDodDvXr10g033GB6JBg0depUDRs2TJdffrnpUWDYa6+9ppdeeumE23/9619rzJgx6tu3r4GpzNmxY4cmTZqkwsJCZWdn67///a+ys7P18MMPmx7NMoGBgVq+fLni4+NNj2LUqFGjTnnM4XBo9erVFk5j3qpVq/T0009r7969ktzPtRwOhzZv3mx4shNRTNfCY489poSEBC1evFgBAe6TzKuqqrRy5UpNnDjR9u9O2bWMPZnTlS/+5sCBA1q+fLnS09NVUVGhuLg4NW3aVK+//rrp0XyOnf6N5ObmKj09XVlZWercubPi4uKUl5enJ5980vRoMGTixImmR/ApCQkJeuWVV+RyuRQXF6fQ0FD16dPHNjlNnz5dV155pTp06GB6FONmz56tSy+9VHfeeWeN29PS0rRnzx499thjhiaz3gsvvKDS0lJNnjxZkvSnP/1JkZGRKi8v1y233GKrgkFyF0/Lly/X7bffLklKTU3V999/r/vuu8/wZGZ89tln2rx5s8rKyjy3PfTQQwYnst4ll1yihx9+WC1atFBiYqJuueUWBQXZ56V8WlqaXC6Xfve739W4fcmSJWre
vLmGDh1qaDLrHTlyRC1atDjh9pYtW+ro0aMGJjIrJSVFDzzwgObMmSNJiomJ0WOPPWa7vxu9evXS3//+d916662mRzGGKwZqmjlzpl588UVdeeWVnh7TV9nnr1k9HDx4UIMGDapxW0BAgAYPHqyXX37Z0FS+w05lrKTTXgZSWVlp4SRm9e7dW927d9eTTz6pq6++WpL7SSNOZKd/I3Fxcbruuuu0bNkyRUdHS3IXDrCva6+91vQIPuXo0aMKCQnR8uXLNXDgQE2YMEGDBw+2TTE9Y8YMZWZmatu2bYqLi1NsbKzCwsJMj2XEp59+qgkTJpxwe2JiogYPHmyrYnrNmjU1zqYPDw9Xenq6nE6nhg0bZruCYcWKFXr33Xd13nnnSZKGDx+uu+++25bF9OzZs7Vp0yZt27ZN/fr10+rVq3XdddeZHsty9957r+699159+umn+stf/qJnn31Wv/3tbxUfH69WrVqZHu+se+edd7RkyZITbo+Li9OIESNsVUwfO3bslMd89TL9s6mkpER9+vTRc889J8nd0zRq1MjwVNZ7++23dfDgQTVp0kRNmzb1nB1rp6vbz/T7//Mr1PxdVFSUunTpYnqMWqGYroXw8HBlZWXp9ttv9xRMLpdLK1euVGhoqOHprHGqS7FdLpeKiooMTGTO6S4Rqb4E1Q7uuecerVy5Us8995yGDh2q/v37mx7JqDVr1pzy2PFn+Pi7KVOmaNmyZUpMTNSQIUM0ePBg0yMZ0bNnz1M+Ztpxk9SPP/5Yr776queNvQ4dOmjUqFG66aabDE9mvfLycknS+vXrNWDAAAUEBNhqyYahQ4dq6NCh2r17tzIyMnT33Xfr8ssv1wMPPKCOHTuaHs9Sp/r/fWBgoK3e0JTcWRy/XMeAAQMkubOoXtLDbqpL6Z9/bDdr1qxRRkaGhgwZomnTpmn06NG2vgqra9euysvLU25urjZu3Kj3339ff/jDH/x+f5eKioqT/jsIDQ211YlBktSxY0etXLnyhL2NPvjgA1tejVT9d6L672ZhYaHPnx16NqSnp5sewbhu3brJ4XCc9GplX13C4mwaPny4XnjhBd188801eqrLLrvM4FQnRzFdC88++6ySk5M1bdo0tWrVSi6XS4WFhYqJidGzzz5rejxL2OVMrtrgEhG3xx57TOPHj9eaNWuUnp6up59+Wk6nU59//rl69uxpejzLnW4JE1988D9bEhMTlZiYqO+//17p6emKj4/XkSNHlJ6erv79+9vmxTVPDr3WrFmjJ598Ug899JCuuuoquVwuffvtt0pJSVGjRo3061//2vSIlrr22mvVv39/uVwupaSk6NChQ7Z8AdWmTRuNGDFCLVq00Lx583T99dfbrpiu3ozm52fwHDlyxPMGhl0cOnSoxufHnwRgxw3Hr7zySk2aNEl33nmnHA6H0tLSdOWVV5oey4jg4GAFBQXJ4XCooqJCrVq10p49e0yPZbnvvvtOS5cu1b/+9S/Fxsbq7bffVps2bXT48GHFxsb6fTF9ujMhjxw5YuEk5o0fP17Dhw/XJ598oquuukqS9M0332j9+vV66623DE9nvYSEBD300EM6cOCAXnzxRWVmZmrcuHGmx7LchRdeqMOHD+uHH37QFVdcYXocI3Jzc02P4FMKCwv15ptvKjMz0/Naw1fX2na47LT4aT0VFxeroKBAkhQdHa2IiAjDE/mGefPmacyYMabHgGHFxcXKyMhQRkaGDh06pE8//dT0SDDg55t5VW+C+M477+jbb7/Vf/7zH8MTmjdp0iQ988wzpsewzH333ac//vGPJyzp8eWXX2rBggVatGiRocnMcLlcys3NVdu2bXXeeeepuLhYe/bsUefOnU2PZgmXy6W1a9dq2bJl2rJli2677TbFxcWpbdu2pkez3Ny5c5WXl6enn37a86ZdSUmJpk6dqosuushWL6yrN+z6+ebJK1as0Mcff6wXX3zRzGCGHD16VPPnz/dcgt2rVy89+OCD
atasmeHJrHfPPffolVdeUWpqqkpKShQVFaWvvvrKdsvHDRw4UMOGDdPgwYPVpEmTGsfeffdd3XXXXYYms8bEiRN12WWXaeTIkTVuX7RokTZv3qxZs2YZmsyMffv2aenSpcrJyZHL5dIVV1yhhIQEtWzZ0vRoRnz55Zf65z//KZfLpb59+6p79+6mR7LcmjVrNHXqVAUGBio7O1ubNm3S/PnztXDhQtOjGZGfn6+8vDzddNNNOnLkiCoqKhQeHm56LEv95je/0V//+tdz4nGBYroWcnNzNXnyZAUEBCg1NVWpqalav369wsPDtXDhQsXExJge0agbb7xRn3zyiekxLHP8unbVazcd/7Gd1nE6lW+//facWc8IDSs1NfWkm3ktWbJE33//vWbMmGFoMt9ht8fM/v37a9WqVb/4mL862T4FISEhtlgjVHLvTxAVFaUhQ4aoR48eJyxZYacrTCorK5WUlKTVq1erXbt2kqQdO3aob9++Sk1NtdXGZjt27NCwYcPUo0cPzxmA3377rT7//HO9/fbbnnxgP/v371doaKicTqcWL16skpISDR8+XBdccIHp0SzjdDr10ksvaezYsaZHMWb//v0aNmyYwsPDazxGFBcX6+2331ZUVJThCa3z85NAqpWWlio4ONhWy4PBa+jQoVq4cKFGjhypzMxMSe5lsT788EOzgxmwbNkyvfrqq54TpLZv365p06bpzTffND2apRITE2vs3+HL7POMtx6eeuopjR49WiUlJbr//vs1btw4vfrqq8rOzlZqaqrtfsF/zm7vbTRr1kzh4eEaMmSI+vTpY8tLsCXp1VdfVfPmzZWYmFjj9tdee02SbFdMd+rU6ZRrCttpTatTbeY1bNiwEzaRtSs7PmbW5Zi/GjVqlAoKChQSEiLJfYZsZGSkgoOD9dxzz6lr165mBzzLGjVqpIMHD2rRokVavHhxjX8Pvnp54dkSFBSk2bNn64cffqhx1tvFF19sejTLtWvXTsuWLdNf/vIXffbZZ5KkmJgYLVu27Jw406ehnOkF5M+fc9lBixYtPB8/+OCDBicxJzAwUF999ZXpMYxq0aKFMjMztXLlSs/j5ZAhQxQbG2u7Dc1mz5590pNA0tLStGfPHlttnCtJ27dv18svv6xdu3bVWG/8/fffNziVGT9/gyY4ONjQJGYtWbJE6enpnr+Zl156qfbv3294Kut16dJFjzzyiG699dYaa0z74jKKFNO1cOTIEfXr10+S+7LL6nKlb9++mjdvnsnRfILdNuhZvXq11q9fr4yMDC1ZskT9+vXTkCFDbHWWlyStXLnypOvo3nPPPbrzzjtPuNTO33399deS3KXj3XffrXfffdfwRGacajOvgIAA2z1WnIrdcigpKTnl5qCHDx+2eBrz+vXrpx49eng2fvzHP/6hb7/9Vtdff71mzJjh95ens0+D19atW7Vv3z716tWrRhm9du1aRUdH2+55RcuWLfWnP/3J9BhGfffdd6ZH8Dnbt2/XwoULtXPnTluXTr/+9a/1xhtvKC4ursabunYpZQ8fPqyDBw+eUMbu2rVL559/vm32MJFOfRJIYmKiBg8ebLtiurp4GzJkiK3PFm/evLn279/veZ2xfv16z0kQdtOoUaMaGypLsuXvRvVziuPXnnc4HBTT56rjz+a5/vrraxyrqqqyehwjxowZc8qzQX/66ScDE5nVo0cP9ejRQ0ePHtUHH3yg4cOH6+GHH1ZCQoLp0Sx1sndhj383zk6Of5EQGBhoyzNBJTbzqjZ06NBTPmYWFRUZmMic6OjoU24O2rp1a4unMW/Dhg16/PHHPZ/fdNNNeu211/TII4/o2LFjBiczo6ysTFu3blWbNm1st/bfnDlzTlrEhoWFafbs2bZaF3Ljxo3asmWLfve730lyP+88ePCgJOmhhx46YY16f2Wn/Qdqa+zYsRo8eLDuuOMOWxYL1arXUJ41a5YcDoftrsibOXOm+vTpozZt2tS4fePGjfri
iy80bdo0Q5NZ71QngQQGBtru5AfJ3cf8v//3/0yPYdyECRM0cuRI7d69W8OHD9eOHTv08ssvmx7LiPDwcOXn53v+PSxfvtyWrznOpc1QKaZroXqH0/POO09PPfWU5/Y9e/bY5l3q3/zmN6c81qlTJwsn8R15eXnKyMjQxx9/rD59+qhHjx6mR7JUWVmZKisrT1gDs6Ki4rQ7Z9uB3ZZqON6AAQM0ceLEk27mdeuttxqezjrjx49XeXn5CW9QHD161HaX1Z1LT4qsUFVVpa+//lpXX321JOk///mP5zHTDktDff7555o2bZrCwsL06KOPaty4caqsrNTRo0f17LPPqn///qZHtMzu3btP+hyqS5cu+t///mdgInPmz5+v++67z/P5tm3b9Nhjj+no0aNatGiRbYrpU11dUs0Xz3I624KCgnT//febHsO43Nxc0yMY9eWXX560fB44cKCt3sSTOAnk57p27arc3FzbdhLVunTpoiVLlniu4u3WrZtCQ0MNT2XG5MmTNX78eOXn56tv375q0qSJ7R4nJOmLL75Q586d1bx5c6WlpWnTpk0aOXKkT244TjFdC/Pnzz/p7aGhoVqwYIHF05hxxx131Ph87969yszMVHp6ulwul63WfPvLX/6i5cuXKzg4WHfccYcyMzNt8wbF8fr06aNnn31WkyZN8rxrX1VVpVmzZql3796GpzPLjmcrVBs9erSSkpLUu3fvEzbzevjhh80OZ6G1a9eechPIPXv2qFevXoYms96Z3qiy2+NncnKyxo0bpyZNmsjhcKi0tFRz5szRkSNHNGLECNPjnXUzZ87UxIkTVVJSoj/+8Y+aP3++rr32Wn3//fd67LHHbFVMV1RU1OmYPyoqKlLPnj09nwcHB+vGG2+UdOZ1l/3Jqa4ukXz38tuzrXfv3vr000/Vp08f06PAoNO9cWu3592cBFLTt99+q2XLlumSSy6pceWu3Zb72bJli9q0aeP5O3H06FFt3bpVHTp0MDyZ9QoKCvT666/rwIEDcrlcioiI0Pfff6+LLrrI9GiWmjZtmlasWKGtW7dq8eLFGjRokB5//HEtWbLE9GgnoJiuh2bNmtnqcv3KykplZ2crPT1dGzduVGVlpd544w2/36jp56ZNm6bOnTurVatWWrNmzQlnt8ydO9fQZNYaN26c/vjHP+rmm29W586dJUk5OTlq3bq1ZwNEO+nZs6fnifGhQ4d03XXXSfJufrhu3TqT41mGzbzcWP/Pq1u3bp5/Gz+/msBOlyFX6969uz7++GPl5+fL5XLp0ksv9ZxF//M3gf1RVVWV50XTvHnzPGfCduzY0eRYRkRERCgnJ8fzN7RaTk6O7ZY1+fkyNsdfaXHgwAGrxzGGK0xOdN111+nBBx9UQECAgoODbfe8qlpubq6Sk5OVm5tb46xYO/0NLS4uVkRExAm32e1KRU4CqWny5MmmR/AJSUlJ+utf/+r5PCgoSBMnTtSyZcsMTmXGzJkzlZGR4Xm8qKqq8txmJ0FBQXI4HPr000919913a/jw4fr73/9ueqyTophGrTzzzDP64IMPdPnll+uOO+7Q3LlzNWDAANuV0tLp1/8rKCiwcBKzmjdvrrffflv//ve/PQVkYmKip5C1m5NtBGlnF198se3K6OOx/p/Xr371Kx07dkxxcXGKjY1VWFiY6ZGMczqdCg4OltPp1M6dOyXJNhvdHf/7//ONaeywlMnxHnzwQT3wwAN66KGH9H//93+SpE2bNmnBggV68sknDU9nrcDAQO3bt09RUVGS5Nmwae/evbb7vZBOvaSHHc+Ynjp1qp555hldccUVtvxdqJaSkqI//elPeuaZZ/T6669r6dKlJzyG+rM777xTY8aM0YwZMzzPL3/44QdNmTLlhKvT/N3JTgL53//+p969e5+wxKIdVL/BffToUUmy1YmDx3M6nWrUqJHn8+rnmXZU/QZmtYCAAFtmUVlZqa+++kqrVq3SjBkzJMlnc7DfIxfq
5J133lG3bt00atQoz6WWditXqrGsSU29evXStdde69lg4GTrTtvBhRdeaHoE+BDW//N66623tHv3bmVkZOiuu+7S5ZdfriFDhqh37962LBmWLl2q2bNnKzw83PN31OFwaPXq1YYns8bu3bs1duzYEz6ufmFtJ71799aMGTO0YMECzZgxQw6HQ507d9a0adNstyTW7373O40bN65G6bRz5049/vjjtiudpJpLepSXl2vz5s3q3LmzLYvpsLAwWy5P8HPl5eW67rrr5HK51LJlS40bN07Dhw/XqFGjTI9mid///vcqLi7WoEGDPMs1lJWVacSIEYqNjTU8nbUmTJig+++/X506dVJYWJgGDRqkkJAQLV68WOPGjbPdY+auXbs0fvx4bd682fN3dNasWT65ju7ZFBQUpF27dnnu986dO227YWzz5s31zTff6KqrrpIkffPNN7Z8w2Ls2LGaNm2aevbsqQ4dOig/P99nTxxzuOx27Qvq5NChQ1q5cqXS09P1008/KS4uTunp6frkk09Mj2YEy5p4bdq0SWPGjPFcXllZWakXX3xRV1xxhenRLDVmzJhTvlnjcDj0wgsvWDsQjJo7d67y8vJOuv7fRRddpHHjxhme0Jx//OMfmjJliu6///4am53ZRb9+/bRkyRLbvpl1psso7bCcSbUvvvjC83H10/Hj/4786le/snwmk55//nktXrxYTZo0keQtne6//37PGdR2tW3bNi1evNhzxpOdLFy4UGFhYbrttttqrB9rt/0J7rzzTqWlpWnYsGF64okn1KpVKw0dOlTZ2dmmR7PMvn379MMPP3iWv2rRooX+/Oc/a9myZTUeT/3dgAED9OGHH0qS/vznP2vNmjVatGiR9uzZoz/+8Y9avny54Qmtde+99+r222/X0KFDJUnLli1TVlaWFi9ebHgya/3zn//UlClTPG9grlmzRk899ZRnvwY7+c9//qOHH37YczXitm3b9NJLL9myqzlXUEzjF8vNzdX777+vrKwstW/fXgMHDtRdd91leizL/HxZk5tvvlkDBgyw1RPD4911110aO3asZwmPzz//XC+88ILeffddw5NZ62Rly6FDh7RkyRIdOHDAs0My7KGyslJJSUlavXr1Cev/paam2u6qApfLpbVr1yojI0O5ubm67bbbFB8fr1atWpkezXJ33XWX7R4fj5eXl6ft27fr5ptvliQ9/fTTKikpkSTdc889iomJMTmepapfREvS9u3b1b59+xoFtd02bpLcl2Jv27ZNknt5m2bNmunGG2+07YkQx/vtb39ry9+JTp06nXCbHfcnWLx4seLi4rRp0yaNHTtWVVVVGjNmjG3e4E1LS9OTTz6p8PBwnX/++Ro/frwmTJigG264QY888oitNjW74447PK87xowZo549eyohIUGSFBcXp8zMTIPTWW/w4MEnlPEnu80O8vPz9e9//1uSdMMNN/js2bFW+Omnn7Rx40a5XC5169bNtksJfvbZZ9q8ebPKyso8tz300EMGJzo5e70yRoPo1KmTnnjiCU2cOFEff/yx5/Jsu2BZk5pKS0trrCvds2dPlZaWGpzIjOPP8isvL9eSJUv05ptvqn///rZb3gVsAnm8WbNm6R//+Ie6deumu+66Sz169DA9klG9evXSzJkzdfvtt9c4+88ua0zPmzdPAwYM8Hy+Zs0a3XPPPTp69KheffVVPf/88wans9bxexPExcXZsnT8uWbNmqlLly41brPjOTTHrzFdVVWlTZs2qaqqyuBE5uTm5poewSfce++9kqQ+ffpow4YNKisr81yRZQdvvvmmMjIy1KFDB3311Vf6/e9/r9mzZ9t2mZfCwkKFhYVpw4YNGjNmjOf248snuwgICND27dt16aWXSnKXs3ZdwuKSSy7RJZdcYnoMnxAWFmbL5a+ON3v2bG3atEnbtm1Tv379tHr1ap/dD4xiGnXWqFEjDRgwoMYLTDv47LPPtHLlSs2cOdOzrImvLiJv
haZNm+rzzz/3lPQbNmyw3eWV1aqqqpSWlqaXX35Z1157rd599121adPG9FgwyO6bQErSG2+8oUsuuURbt27VrFmzTjhutzKu+kym43fFttMa0zt37lT//v09nzdt2lSJiYmS5Pm/dmTnN7jPxI7ZHL/GdFBQkNq2bau5c+canMisAwcO6JtvvpHD4dBVV12l8PBw0yNZpvoKglOxy5uaQUFB6tChgyTpmmuuUZs2bWxbSo8aNUpxcXFq1KiRrrnmGs/vwMaNG3XBBRcYns5648aNU2JioueKq9zcXM2cOdPwVNb7+uuvNWvWLO3atUtOp9OzAeC6detMjwZD1qxZo4yMDA0ZMkTTpk3T6NGjfXZzbYpp4BcKDQ1VYmKiEhMTPcuaHDt2TImJibZb1kSSJk+erLFjx3rWe6uoqNC8efMMT2W9v/3tb5o7d64uueQSvfbaa54nz4DdLVq0SOXl5SdsOnL06FHP44ad2HXZp2qVlZU1Pp8zZ47n40OHDlk9DnzE6cq3n//O2MFbb71legSfsXbtWj366KOe0un777/XrFmzdP311xuezBqn29zQTm9qVlRUKC8vz3MFRUBAQI3P7VLQS9Jtt92m7t27a//+/TWWuomOjtb06dMNTmZGnz599MEHH+ibb76Ry+VS165dFRERYXosyz3++ON68MEH1bVrV1tuLo4TBQcHKygoSA6HQxUVFWrVqpX27NljeqyTopgG6sHuy5pIUpcuXfTRRx8pPz9fLpdLl156qRo1amR6LMuNGzdOF1xwgRo1aqSXXnrphON2PtMJ9rZ27VpdeumlJ+wSv2TJEu3Zs0e9evUyNJm1ysvLFRwcfMqljuxypUlFRYUOHz7suQS9ffv2kqTDhw+rvLzc5GiWO76MLSsrq1GySPYqWk5Xvh2/5I2/W7p06WmP2/Gqgueff15Lly71PFbk5eXp0UcftU0xbfc3M6sdO3ZMI0eOrHFb9ed2KuirRUVFKSoqqsZtdty34+fsXMg2adJEAwcOND0GfEjz5s1VWlqqbt26KSkpSVFRUT67zA2bHwKot9LSUu3Zs6fGkiZ2ekEtuXeAPt3lxsevQQ3Yye23364VK1ac8ETI6XRq8ODBysrKMjSZtao3K+rUqZMcDkeNAtJOG3m9+OKL2rp1q55++mlPOX348GE98cQTuuSSSzR27FjDE1qnb9++pzxmx6IF0qRJkyS5l67YsGGDZy3IdevW6YYbbrDlFWmDBg3SihUrznibvzrTvi12eVMTOJWPPvpIU6ZM0ZVXXqmqqirl5uZq+vTpuummm0yPZqnnn39eV199te3XVYbX/v37FRoaKqfTqcWLF6ukpETDhw/3ySV/KKYB1MvSpUs1e/ZshYeHe4pZO76gdjqd+uijjxQWFqZevXrpz3/+s9atW6d27dpp9OjRCgkJMT0iYMTAgQO1cuXKX3wM/qmyslJJSUlavXq12rVrJ0nasWOH+vXrp2effVZBQVzMB4wePVpJSUlq27atJGnXrl167rnnbLU5aLURI0Zo0KBBGjJkiCQpIyNDy5cv15tvvml2MIuc7M3ManZ6UxM4ldtuu00LFizwbPq3Y8cOPfDAA/rb3/5meDJr9ezZUwcPHlTz5s0VHBzMGtM4p/DsH0C9LFq0SFlZWbrwwgtNj2LUk08+qS1btqi8vFxt2rRRWVmZbrzxRn3xxReaOnWqLV9MApJ7CYvS0tITzuo6cuSI7ZZukKS0tDRdd911tt0YNSgoSLNnz9YPP/ygnJwcSVLnzp1tv0kocLz//e9/nlJaktq2bav8/HyDE5kzbdo0TZgwQcnJyXI4HIqJiTnpRrr+Kjc31/QIgE8LCwvzlNKS1K5dO1ttkFotPT3d9AjwEWPGjDntldy+uMQoxTSAeomKirJ9KS1JX375pT744AOVlpbqhhtu0Oeff67g4GDFx8dr0KBBpscDjBkwYIAmTpxYY+mGkpISTZ06Vbfeeqvh6ayXk5OjN954
QxUVFerZs6euu+469ezZUy1atDA9mqUuvvhiymjgFMLDwzV//nzP2vzp6em2LFok6aKLLtJ7772nI0eOyOVyef6OALC36mVubrjhBr388sv67W9/K5fLpWXLlunmm282PJ31eD2Oar/5zW9Mj/CLsZQHgHqZN2+ejh07pttvv73GBkV2W2M6Li5OmZmZJ3wsedeWBezoVEs39O3bV6mpqbZdumHPnj365JNP9Oqrr6qgoIDLsQF4FBYWasaMGVq/fr0k9yXakydPtu3mZjt37tTOnTtr7GVil3VUf//73+vPf/6zevbsWeMMOC7Th92xzE1NBQUFmjVrlnJzc1VWVua53W7La+LcZM9XgwAaTHUB+/e//91zmx3XmC4vL1deXp5cLleNjyXVeHIA2M3Pl25wuVy64oorbHu27Hfffad169Zp3bp12rdvn2644QbPBmcAIEmtWrWy5UaHJzNnzhylpaWpffv2CggIkOR+nmmXYrp62RIu0wdqYpmbmiZPnqwBAwZo8+bNmj17tt555x1ddNFFpseCQXv37tVTTz2l9evXy+FwqEePHnr88cfVsmVL06OdgDOmAaAB9O3b95TH7FjUAzi5Tp06qVu3bho/fry6d+9uehwAPqi0tFSvvPKKdu3apTlz5igvL0/5+fm66aabTI9muZtvvlkZGRks4SGpoqJC+fn5cjgcuuSSS2x7xRGAE1VfsVu9sXhVVZVGjBihJUuWmB4NhowYMULdu3evsSzYhg0bfHLzYP6aAai3devWKS8vT8OGDVNRUZEOHTpUYxMKO8jOzjY9AoBzwHvvvafPP/9c8+fPV1FRka6++mr16tVLt9xyi+nRAPiIlJQURUVFec4IbN26tcaPH2/LYjoqKopSWu69TMaPH68mTZp4rs577rnndPXVV5seDYAPaNSokSSpWbNm+vHHH9WiRQv9+OOPhqeCSQcOHNBDDz3k+fzBBx/UqlWrDE50ahTTAOrl1Vdf1Zo1a7Rv3z4NGzZMFRUVmjx5st555x3TowGAz+nSpYu6dOmiwYMH65NPPtErr7yiv/71r7ZbCxHAqW3ZskWpqan67LPPJEnNmzdXVVWV4anM6Nq1qx555BHdeuutNfYysctSHtWmTZum2bNn61e/+pUkd1GdkpKiFStWGJ4MgC/o3r27Dh48qLvvvltDhgxRcHCwLTcZh9fFF1+sH374wbN84s6dO9WhQwfDU50cxTSAesnKylJ6errnEpHWrVvr8OHDhqcCAN80bdo0rVu3TmVlZerZs6f+9Kc/qWfPnqbHAuBDqs98q1ZWVnbSDb7sYNOmTZKkt956y3ObndaYrta4cWNPKS25S6gmTZoYnAiAL+ndu7cCAwMVFxena6+9Vj/++KMqKipMjwWDDh8+rMGDB+uaa66RJH399df61a9+pbFjx0qS5s6da3K8GiimAdRLkyZNTngBdfyu4QAA6ccff9QFF1ygjh076t5771Xbtm09x/773//65EYkAMzo3r27Fi5cqPLycq1fv16LFy8+7V4W/uz4QtrOunfvrhUrVmjQoEGSpJUrV6pPnz6GpwLgK2bOnKmMjAxJ0gUXXKDWrVtr6NChnttgPwMHDtTAgQM9n8fGxhqc5vQopgHUS+vWrfXll1/K4XCoqqpKCxcu9NlLRADAlNGjRysjI0Px8fH67W9/q/fff99z7IknnuCFAwCPcePG6fXXX1fz5s01e/Zs9e3bV6NGjTI9ljElJSXKz89XWVmZ57bjzx62g4yMDC1evFhPPPGEJKm8vFzh4eF6++235XA4tG7dOsMTAjDJ5XLVODksICBATqfT4EQw7Y477jA9Qq1RTAOolylTpmjixInaunWrrrrqKnXv3l2zZ882PRYA+JTjL8OvrKw85TEA9rV06VLPx6GhoYqLi/N8/u677yoxMdHAVGZ9+OGHSk1N1aFDh9SyZUvt3LlTnTp1st2beenp6aZHAODDmjdvrm+++UZXXXWVJOmb
b75Rs2bNDE8Fk4qLizV9+nStW7dODodDvXr10uOPP66IiAjTo52AYhpAvURFRWnRokUqLS1VVVWVmjdvbnokAPA5x5/F8vPljlj+CIAkTZ8+XVdeeSVXnh1n4cKFWrZsme677z5lZmbqX//6lz766CPTY1nuwgsvND0CAB/26KOPavTo0brsssskSdu2bdNLL71keCqYlJycrMsuu0xJSUlyuVx67733NHXqVJ/8vaCYBlAna9asOe1xu21KAwCnU1ZWpry8PLlcrhofVx8DgBkzZigzM1Pbtm1TXFycYmNjFRYWZnoso4KCghQZGem5JP3666/Xiy++aHgq6xUUFGjWrFnKzc2t8Tdj9erVBqcC4Cu6deumDz74QBs3bpTL5VK3bt1s//fD7nbu3Fnj7+WYMWM0ePBggxOdGsU0gDp5/fXXJbnXuNu0aZMuv/xySdKWLVvUtWtXimkAOM6xY8c0cuRIz+fHf8wZ0wAkaejQoRo6dKh2796tjIwM3X333br88sv1wAMPqGPHjqbHMyI4OFgul0sXX3yx3nrrLV144YU6cOCA6bEsN3nyZA0YMECbN2/W7Nmz9c477+iiiy4yPRYAHxIWFsZrcHhUVVWpqKhIkZGRkqSioiJVVVUZnurkKKYB1En1LumPPvqoJk+e7FnP6ttvv62xqRcAQMrOzjY9AoBzRJs2bTRixAi1aNFC8+bN0/XXX2/bYnrs2LE6fPiwJkyYoJSUFJWUlCg5Odn0WJY7cOCA7rzzTi1ZskTdunXTVVddpREjRpgeCwDgo+677z7FxcXpxhtvlMPh0Jo1a/TII4+YHuukKKYB1EteXp6nlJakLl26KCUlxdxAAAAA5yCXy6W1a9dq2bJl2rJli2677Ta99957atu2renRjLnuuuskSSEhIXrzzTfNDmNQo0aNJEnNmjXTjz/+qBYtWujHH380PBUAwFfFxcWpc+fO2rBhg1wul+655x7PGuS+hmIaQL0EBQVp+fLlnvWKVqxYoaAgHloAAAB+iT59+igqKkpDhgzR6NGj5XA4VFZWpm3btkmSz76gPBuWLl162uOJiYkWTWLewYMHdd555+mnn35SQkKChgwZouDgYN16662mRwMA+LALLrhA3bp10xVXXGF6lNNyuKp33gGAOti2bZsee+wxbd26VQEBAerQoYOSkpLUvXt306MBAACcM/r27ev52OFw6PiXaQ6Hw1Yb3U2aNOm0x5955hmLJjHrww8/1KRJk9S8eXOVl5frxRdf1MUXX6zDhw979ncBAODn1qxZo6lTpyowMFDZ2dnatGmT5s+fr4ULF5oe7QQU0wAaxI8//qiMjAytWLFCLpdLH330kemRAAAAgHPWwIEDNXPmTMXExOjzzz/X/PnzPfu8AABwKkOHDtXChQs1cuRIZWZmSpIGDBigDz/80OxgJ8H19gDqrLKyUtnZ2UpPT9fGjRtVWVmpRYsW1VhzGgAAAKiL0tJSvfLKK9q1a5fmzJmjvLw85efn66abbjI9miUCAgIUExMjSerZs6eeffZZwxMBAM4VUVFRNT4PDg42NMnpBZgeAMC56ZlnntGNN96od999V7GxsVqzZo3CwsIopQEAANAgUlJSVFlZqdzcXElS69at9dJLLxmeyjoVFRXKy8vTtm3btG3bNpWXl9f4HACAk2nevLn2798vh8MhSVq/fr1CQkIMT3VynDENoE7eeecddevWTaNGjVLPnj0lyfOgBwAAANTXli1blJqaqs8++0yS+4V2VVWV4amsc+zYMY0cObLGbdWf223dcQBA7U2YMEEjR47U7t27NXz4cO3YsUMvv/yy6bFOimIaQJ189tlnWrlypWbOnKmffvpJcXFxcjqdpscCAACAn2jUqFGNz8vKymSnLZKys7NNjwAAOAd16dJFS5Ys0ddffy1J6tatm0JDQw1PdXJsfgig3nJzc/X+++8rKytL7du318CBA3XXXXeZHgsAAADnsJkzZyo0NFQr
VqxQcnKyFi9erI4dO2rcuHGmRwMAwOeVl5fXOIGwadOmBqc5OYppAA2moqJCH3/8sTIyMvTaa6+ZHgcAAADnsIqKCr3++uueM4f79u2rUaNGKTAw0PBkAAD4ro8++khPPfWU9u3bJ0lyuVxyOBzavHmz4clORDENAAAAAPAZS5cuPe3xxMREiyYBAODcc/PNNys1NVVdu3ZVQECA6XFOizWmAQAAAAA+Y/r06bryyivVoUMH06MAAHDOCQsL09VXX216jFrhjGkAAAAAgM9IT09XZmamjh07pri4OMXGxiosLMz0WAAAnBNeeeUVhYSEaMCAAWrcuLHndtaYBgAAAACgFnbv3q2MjAz97W9/0+WXX64HHnhAHTt2ND0WAAA+rVOnTp6PHQ6HT68xzVIeAAAAAACf06ZNG40YMUItWrTQvHnzdP3111NMAwBwBrm5uaZHqDXOmAYAAAAA+AyXy6W1a9dq2bJl2rJli2677TbFxcWpbdu2pkcDAAANiGIaAAAAAOAzevfuraioKA0ZMkQ9evSQw+Gocfyyyy4zNBkAAGhIFNMAAAAAAJ/Rt29fz8fVa2Me//nq1atNjAUAABoYxTQAAAAAAAAAwFIBpgcAAAAAAAAAANgLxTQAAAAAAAAAwFIU0wAAAAAAAAAAS1FMAwAAAAAAAAAs9f8BHyqn5Q0U+4gAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 1800x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.figure(figsize=(25, 5))\n",
    "sns.set_theme()\n",
    "profile_mut_all[\"tumor_types\"].value_counts().sort_index().plot(kind=\"bar\")\n",
    "#sns.countplot(x=profile_mut_all[\"tumor_types\"], palette=sns.hls_palette(2))\n",
    "plt.xticks(rotation=90);\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check activites data content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 425,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Some content from the full act set:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SBS1</th>\n",
       "      <th>SBS2</th>\n",
       "      <th>SBS3</th>\n",
       "      <th>SBS4</th>\n",
       "      <th>SBS5</th>\n",
       "      <th>SBS6</th>\n",
       "      <th>SBS7a</th>\n",
       "      <th>SBS7b</th>\n",
       "      <th>SBS7c</th>\n",
       "      <th>SBS7d</th>\n",
       "      <th>...</th>\n",
       "      <th>SBS52</th>\n",
       "      <th>SBS53</th>\n",
       "      <th>SBS54</th>\n",
       "      <th>SBS55</th>\n",
       "      <th>SBS56</th>\n",
       "      <th>SBS57</th>\n",
       "      <th>SBS58</th>\n",
       "      <th>SBS59</th>\n",
       "      <th>SBS60</th>\n",
       "      <th>tumor_types</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mut_tri</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30</td>\n",
       "      <th>30</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30</td>\n",