Skip to content
Snippets Groups Projects
MultiClass_MolBi.ipynb 1.58 MiB
Newer Older
jpronkko's avatar
jpronkko committed

       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C&gt;A</td>\n",
       "      <td>ACA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C&gt;A</td>\n",
       "      <td>ACC</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 9693 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  Mutation type Trinucleotide  ALL::TARGET-10-PAIXPH-03A-01D  \\\n",
       "0           C>A           ACA                              0   \n",
       "1           C>A           ACC                              0   \n",
       "\n",
       "   ALL::TARGET-10-PAKHZT-03A-01R  ALL::TARGET-10-PAKMVD-09A-01D  \\\n",
       "0                              0                              0   \n",
       "1                              0                              0   \n",
       "\n",
       "   ALL::TARGET-10-PAKSWW-03A-01D  ALL::TARGET-10-PALETF-03A-01D  \\\n",
       "0                              1                              0   \n",
       "1                              1                              0   \n",
       "\n",
       "   ALL::TARGET-10-PALLSD-09A-01D  ALL::TARGET-10-PAMDKS-03A-01D  \\\n",
       "0                              0                              0   \n",
       "1                              0                              0   \n",
       "\n",
       "   ALL::TARGET-10-PAPJIB-04A-01D  ...  Head-SCC::V-109  Head-SCC::V-112  \\\n",
       "0                              2  ...                0                0   \n",
       "1                              0  ...                1                0   \n",
       "\n",
       "   Head-SCC::V-116  Head-SCC::V-119  Head-SCC::V-123  Head-SCC::V-124  \\\n",
       "0                0                0                0                0   \n",
       "1                0                0                0                0   \n",
       "\n",
       "   Head-SCC::V-125  Head-SCC::V-14  Head-SCC::V-29  Head-SCC::V-98  \n",
       "0                0               0               0               1  \n",
       "1                0               1               0               0  \n",
       "\n",
       "[2 rows x 9693 columns]"
      ]
     },
     "execution_count": 358,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the WES_Other.96.csv catalog and preview its first two rows\n",
    "other_wes_mut = pd.read_csv(\"./project_data/catalogs/WES/WES_Other.96.csv\")\n",
    "other_wes_mut.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 359,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cancer Types</th>\n",
       "      <th>Sample Names</th>\n",
       "      <th>Accuracy</th>\n",
       "      <th>SBS1</th>\n",
       "      <th>SBS2</th>\n",
       "      <th>SBS3</th>\n",
       "      <th>SBS4</th>\n",
       "      <th>SBS5</th>\n",
       "      <th>SBS6</th>\n",
       "      <th>SBS7a</th>\n",
       "      <th>...</th>\n",
       "      <th>SBS51</th>\n",
       "      <th>SBS52</th>\n",
       "      <th>SBS53</th>\n",
       "      <th>SBS54</th>\n",
       "      <th>SBS55</th>\n",
       "      <th>SBS56</th>\n",
       "      <th>SBS57</th>\n",
       "      <th>SBS58</th>\n",
       "      <th>SBS59</th>\n",
       "      <th>SBS60</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ALL</td>\n",
       "      <td>TARGET-10-PAIXPH-03A-01D</td>\n",
       "      <td>0.529</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ALL</td>\n",
       "      <td>TARGET-10-PAKHZT-03A-01R</td>\n",
       "      <td>0.696</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 68 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  Cancer Types              Sample Names  Accuracy  SBS1  SBS2  SBS3  SBS4  \\\n",
       "0          ALL  TARGET-10-PAIXPH-03A-01D     0.529     0     0     0     0   \n",
       "1          ALL  TARGET-10-PAKHZT-03A-01R     0.696     0     0     0     0   \n",
       "\n",
       "   SBS5  SBS6  SBS7a  ...  SBS51  SBS52  SBS53  SBS54  SBS55  SBS56  SBS57  \\\n",
       "0     0     0      0  ...      0      0      0      1      0      0      0   \n",
       "1     0     0      0  ...      0      0      0      1      0      0      0   \n",
       "\n",
       "   SBS58  SBS59  SBS60  \n",
       "0      0      0      0  \n",
       "1      0      0      0  \n",
       "\n",
       "[2 rows x 68 columns]"
      ]
     },
     "execution_count": 359,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the WES_Other.activities.csv table and preview its first two rows\n",
    "other_wes_act = pd.read_csv(\"./project_data/activities/WES/WES_Other.activities.csv\")\n",
    "other_wes_act.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports and helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 360,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "from sklearn.decomposition import PCA\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "#import torch \n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import label_binarize\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "\n",
    "def plot_confusion_mat(y_test, y_pred, labs=None, size=None, title=None):\n",
    "    \"\"\"\n",
    "    plot_confusion_mat plots a confusion matrix of true versus predicted labels as an annotated heatmap.\n",
    "\n",
    "    :param y_test: a vector containing the numerically encoded true label values used in the model test\n",
    "    :param y_pred: a vector containing the numerically encoded label values from prediction\n",
    "    :param labs: an optional list of label strings used as tick labels on both axes\n",
    "    :param size: an optional tuple containing the x,y size of the plot, (12, 10) is used when omitted\n",
    "    :param title: an optional title for the whole plot\n",
    "    :return: no value\n",
    "    \"\"\"\n",
    "    # Open exactly one figure: opening a default sized figure first and then a second\n",
    "    # one for a custom size would leave an empty figure behind in the notebook output.\n",
    "    plt.figure(figsize=(12, 10) if size is None else size)\n",
    "\n",
    "    cm = sklearn.metrics.confusion_matrix(y_test, y_pred)\n",
    "\n",
    "    heatmap_kwargs = dict(square=False, annot=True, fmt='d', cmap='viridis', cbar=True)\n",
    "    if labs is not None:\n",
    "        heatmap_kwargs.update(xticklabels=labs, yticklabels=labs)\n",
    "    sns.heatmap(cm, **heatmap_kwargs)\n",
    "\n",
    "    # confusion_matrix puts the true labels on the rows and the predictions on the columns\n",
    "    plt.xlabel('Predicted label')\n",
    "    plt.ylabel('True label')\n",
    "\n",
    "    if title is not None:\n",
    "        plt.title(title)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset preprocess routines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def prepare_mut_df(raw_mutation_dfs, is_profile, pre_sample_limit=None, small_sample_limit=None, grouping=True):\n",
    "    \"\"\"\n",
    "    prepare_mut_df prepares a data set for further analysis from WGS & WES profile or activity catalogs.\n",
    "\n",
    "    :param raw_mutation_dfs: a list containing the data frames to combine\n",
    "    :param is_profile: True/False whether we are dealing with profile data ('Mutation type' and 'Trinucleotide'\n",
    "        columns, one column per sample); if False, activities data ('Cancer Types', 'Sample Names' and\n",
    "        'Accuracy' columns, one row per sample) is assumed\n",
    "    :param pre_sample_limit: if not None, drop tumor types that do not have more samples than this limit before grouping\n",
    "    :param small_sample_limit: if not None, drop tumor types that do not have more samples than this limit after grouping\n",
    "    :param grouping: True/False whether to merge tumor subtypes into their tissue group or not\n",
    "    :return: a tuple of (Pandas dataframe with samples in rows, features in columns and the tumor type in the\n",
    "        last column tumor_types, sorted list of the unique tumor types)\n",
    "    \"\"\"\n",
    "    tissue_groups = {'Bone', 'Breast', 'Cervix', 'CNS', 'Eye', 'Liver', 'Lymph', 'Lung', 'Kidney', 'Myeloid', 'Panc'}\n",
    "\n",
    "    def normalize_label(label):\n",
    "        # Harmonize a trailing 'Ca' of the tumor type (e.g. 'AdenoCa' -> 'AdenoCA'). Only the part before\n",
    "        # '::' is touched and only at the end of a word, so that sample ids and names such as\n",
    "        # 'Carcinoid' are left alone.\n",
    "        tumor, sep, sample = label.partition('::')\n",
    "        return re.sub(r'Ca\\b', 'CA', tumor) + sep + sample\n",
    "\n",
    "    def to_tissue_group(name):\n",
    "        # Map 'Group-anything' to 'Group', also for multi part names such as 'CNS-glioma-NOS'\n",
    "        prefix = name.split('-', 1)[0]\n",
    "        return prefix if prefix in tissue_groups else name\n",
    "\n",
    "    #######################################\n",
    "    # Combining data and rename some items\n",
    "    #######################################\n",
    "\n",
    "    frames = []\n",
    "    for df in raw_mutation_dfs:\n",
    "        # Work on a copy so that the caller's raw data frame stays untouched\n",
    "        mutations = df.copy()\n",
    "\n",
    "        if is_profile:\n",
    "            # Features are in rows and samples in columns: label the features and transpose\n",
    "            mutations['mut_tri'] = mutations['Mutation type'].astype(str) + '_' + mutations['Trinucleotide'].astype(str)\n",
    "            mutations = mutations.set_index('mut_tri').drop(['Mutation type', 'Trinucleotide'], axis=1)\n",
    "            mutations = mutations.T\n",
    "        else:\n",
    "            # Samples are already in rows: build the same 'type::sample' labels as the profile data has\n",
    "            mutations['mut_tri'] = mutations['Cancer Types'].astype(str) + '::' + mutations['Sample Names'].astype(str)\n",
    "            mutations = mutations.set_index('mut_tri').drop(['Cancer Types', 'Sample Names', 'Accuracy'], axis=1)\n",
    "\n",
    "        mutations = mutations.rename(index=normalize_label)\n",
    "\n",
    "        # Normalize every sample (row) to sum to one; a row summing to zero becomes all NaN\n",
    "        mutations = mutations.divide(mutations.sum(axis=1), axis=0)\n",
    "\n",
    "        frames.append(mutations)\n",
    "\n",
    "    # Concatenate once instead of growing a data frame inside the loop\n",
    "    mutations_all = pd.concat(frames) if frames else pd.DataFrame()\n",
    "\n",
    "    # The tumor type is the part of the sample label before the first ':'\n",
    "    mutations_all[\"tumor_types\"] = [label.split(':')[0] for label in mutations_all.index]\n",
    "\n",
    "    ##############################################\n",
    "    # Pre culling of samples based on small counts\n",
    "    ##############################################\n",
    "\n",
    "    print(\"Pre cull dim\", mutations_all.shape)\n",
    "    if pre_sample_limit is not None:\n",
    "        mutations_all = cull_small_sample_counts(mutations_all, pre_sample_limit)\n",
    "    print(\"After pre cull dim\", mutations_all.shape)\n",
    "\n",
    "    ################################################\n",
    "    # Grouping (done changing the tumor_type column)\n",
    "    ################################################\n",
    "\n",
    "    if grouping:\n",
    "        # assign builds a new frame, so a filtered frame returned by the culling is never written into\n",
    "        mutations_all = mutations_all.assign(tumor_types=mutations_all[\"tumor_types\"].map(to_tissue_group))\n",
    "\n",
    "    #########################################################\n",
    "    # Post grouping culling of samples based on sample size\n",
    "    #########################################################\n",
    "\n",
    "    if small_sample_limit is not None:\n",
    "        mutations_all = cull_small_sample_counts(mutations_all, small_sample_limit)\n",
    "    print(\"After post cull dim\", mutations_all.shape)\n",
    "\n",
    "    mutations_all = mutations_all.sort_index()\n",
    "\n",
    "    # Prepare a sorted list with every tumor type appearing only once\n",
    "    unique_tumor_types = sorted(set(mutations_all[\"tumor_types\"]))\n",
    "\n",
    "    return (mutations_all, unique_tumor_types)\n",
    "\n",
    "\n",
    "def cull_small_sample_counts(mutations, small_sample_limit):\n",
    "    \"\"\"\n",
    "    cull_small_sample_counts drops the samples of the tumor types that have too few samples.\n",
    "\n",
    "    :param mutations: a data frame with one sample per row and a tumor_types column\n",
    "    :param small_sample_limit: a tumor type is kept only if it has strictly more samples than this limit\n",
    "    :return: a new Pandas dataframe containing only the rows of the kept tumor types\n",
    "    \"\"\"\n",
    "    counts = mutations[\"tumor_types\"].value_counts()\n",
    "    kept_types = counts[counts > small_sample_limit].index\n",
    "\n",
    "    # Return an independent copy so that callers can assign columns to the result\n",
    "    # without pandas raising SettingWithCopyWarning.\n",
    "    return mutations[mutations[\"tumor_types\"].isin(kept_types)].copy()\n",
    "\n",
    "def print_dset_diag(mut_df, unique_tumor_types, small_sample_limit):\n",
    "    \"\"\"\n",
    "    print_dset_diag prints textual diagnostics of a profile or activities data frame\n",
    "\n",
    "    :param mut_df: the data frame to report on, numeric feature columns plus a tumor_types column\n",
    "    :param unique_tumor_types: an array of the tumor types obtained previously\n",
    "    :param small_sample_limit: an integer of the cut off limit used in the data processing step\n",
    "    :return: no value\n",
    "    \"\"\"\n",
    "    # Check if the data frame is ok\n",
    "    print(\"\\n---Data set diagnostics print---\\n\")\n",
    "    print(\"Missing entries in mutations:\", mut_df.isnull().sum().sum())\n",
    "    print(\"The shape of the mutations data frame\", mut_df.shape)\n",
    "\n",
    "    # Check to see if the rows are normalized to one on a reproducible sample of at most\n",
    "    # five rows (fewer when the frame itself is smaller, where sample(n=5) would raise)\n",
    "    norm_df = mut_df.sample(n=min(5, len(mut_df)), random_state=5)\n",
    "    # Sum the feature columns only; dropping the label column by name does not rely on it being last\n",
    "    print(\"Checking normalization: sum of some rows:\\n\", norm_df.drop(columns=\"tumor_types\").sum(axis=1))\n",
    "    print(\"\\n\")\n",
    "\n",
    "    # Check some counts of tumor types\n",
    "    tumor_counts = mut_df[\"tumor_types\"].value_counts()\n",
    "    print(\"Tumor counts:\\n\", tumor_counts)\n",
    "    print(\"\\n\")\n",
    "\n",
    "    # Flag the tumor types whose sample count is below 1.5 times the cut off limit\n",
    "    small_counts = tumor_counts < 1.5*small_sample_limit\n",
    "    print(\"Tumor types with smallish counts:\",  sum(small_counts))\n",
    "\n",
    "    print(tumor_counts[small_counts])\n",
    "    print(\"\\n\")\n",
    "\n",
    "    # Tumor types\n",
    "    print(\"Unique tumor types: \", len(unique_tumor_types))\n",
    "    print(unique_tumor_types)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a profile set with no grouping and no limits for reference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 362,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pre cull dim (23829, 97)\n",
      "After pre cull dim (23829, 97)\n",
      "After post cull dim (23829, 97)\n",
      "Profile data:\n",
      "\n",
      "---Data set diagnostics print---\n",
      "\n",
      "Missing entries in mutations: 0\n",
      "The shape of the mutations data frame (23829, 97)\n",
      "Checking normalization: sum of some rows:\n",
      " Thymoma::TCGA-4V-A9QI-01A-11D-A423-09    1.0\n",
      "CNS-Medullo::SP107464                    1.0\n",
      "Prost-AdenoCA::SP114926                  1.0\n",
      "CNS-Medullo::SP78663                     1.0\n",
      "Sarcoma-bone::IC086T_WGS                 1.0\n",
      "dtype: float64\n",
      "\n",
      "\n",
      "Tumor counts:\n",
      " Breast-cancer       1637\n",
      "Liver-HCC           1318\n",
      "ColoRect-AdenoCA    1185\n",
      "Prost-AdenoCA       1091\n",
      "Skin-Melanoma       1070\n",
      "                    ... \n",
      "Myeloid-MDS            4\n",
      "Eye-RB                 4\n",
      "Breast-DCIS            3\n",
      "Cervix-AdenoCA         2\n",
      "Bone-cancer            2\n",
      "Name: tumor_types, Length: 82, dtype: int64\n",
      "\n",
      "\n",
      "Tumor types with smallish counts: 0\n",
      "Series([], Name: tumor_types, dtype: int64)\n",
      "\n",
      "\n",
      "Unique tumor types:  82\n",
      "['ALL', 'AML', 'Adrenal-neoplasm', 'Biliary-AdenoCA', 'Bladder-TCC', 'Blood-CMDI', 'Bone-Benign', 'Bone-Epith', 'Bone-Osteosarc', 'Bone-cancer', 'Breast-AdenoCA', 'Breast-DCIS', 'Breast-Fibroadenoma', 'Breast-LobularCA', 'Breast-cancer', 'CNS-GBM', 'CNS-LGG', 'CNS-Medullo', 'CNS-Oligo', 'CNS-PiloAstro', 'CNS-glioma-NOS', 'Cervix-AdenoCA', 'Cervix-CA', 'Cervix-SCC', 'ColoRect-AdenoCA', 'ColoRect-Adenoma', 'DLBC', 'Eso-AdenoCA', 'Eso-SCC', 'Ewings', 'Eye-Melanoma', 'Eye-RB', 'Head-SCC', 'Kidney-ChRCC', 'Kidney-NOS', 'Kidney-Papillary', 'Kidney-RCC', 'Kidney-Wilms', 'Liver-Benign', 'Liver-HCC', 'Lung-AdenoCA', 'Lung-CArcinoid', 'Lung-NOS', 'Lung-SCC', 'Lung-Small', 'Lymph-BNHL', 'Lymph-CLL', 'Lymph-NOS', 'Lymph-TNHL', 'Lymph-cHL', 'Meninges-Meningioma', 'Mesothelium-Mesothelioma', 'Myeloid-AML', 'Myeloid-MDS', 'Myeloid-MPN', 'Neuroblastoma', 'Oral-SCC', 'Ovary-AdenoCA', 'Panc-AdenoCA', 'Panc-Endocrine', 'Panc-Other', 'Para-AdenoCA', 'Para-Adenoma', 'Pheochromocytoma', 'Pit-All', 'Prost-AdenoCA', 'Prost-Adenoma', 'Sarcoma', 'Sarcoma-bone', 'Skin-BCC', 'Skin-Melanoma', 'Skin-SCC', 'Small-Intestine-carcinoid', 'SoftTissue-Leiomyo', 'SoftTissue-Liposarc', 'Stomach-AdenoCA', 'Testis-CA', 'Thy-AdenoCA', 'Thymoma', 'Transitional-cell-carcinoma', 'UCS', 'Uterus-AdenoCA']\n"
     ]
    }
   ],
   "source": [
    "# The four raw profile catalogs that are combined into one data frame\n",
    "profile_raw_data_sets = [PCAWG_wgs_mut, TCGA_wes_mut, nonPCAWG_wgs_mut, other_wes_mut]\n",
    "\n",
    "# Reference set: grouping is switched off and the limits of 0 only require a tumor type\n",
    "# to have more than 0 samples, so nothing is culled\n",
    "profile_all_no_grp, prf_unique_tumor_types_no_grp = prepare_mut_df(profile_raw_data_sets, True, pre_sample_limit = 0, small_sample_limit = 0, grouping=False)\n",
    "\n",
    "# Print some diagnostics from the prepared data set\n",
    "print(\"Profile data:\")\n",
    "print_dset_diag(profile_all_no_grp, prf_unique_tumor_types_no_grp, 0)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a profile data set with tissue grouping\n",
    "\n",
    "From all profile sets, a combined data frame is made, which has samples in the rows and features in the columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pre cull dim (23829, 97)\n",
      "After pre cull dim (23814, 97)\n",
      "After post cull dim (23678, 97)\n",
      "Profile data:\n",
      "\n",
      "---Data set diagnostics print---\n",
      "\n",
      "Missing entries in mutations: 0\n",
      "The shape of the mutations data frame (23678, 97)\n",
      "Checking normalization: sum of some rows:\n",
      " ALL::SJBALL020936_D1                           1.0\n",
      "Breast-cancer::TCGA-C8-A274-01A-11D-A16D-09    1.0\n",
      "Prost-AdenoCA::TCGA-EJ-7317-01A-31D-2114-08    1.0\n",
      "Kidney-RCC::ccRCC-103                          1.0\n",
      "Eso-SCC::420                                   1.0\n",
      "dtype: float64\n",
      "\n",
      "\n",
      "Tumor counts:\n",
      " Breast                         1855\n",
      "Lung                           1668\n",
      "CNS                            1595\n",
      "Liver                          1358\n",
      "Kidney                         1269\n",
      "Lymph                          1192\n",
      "ColoRect-AdenoCA               1185\n",
      "Panc                           1157\n",
      "Prost-AdenoCA                  1091\n",
      "Skin-Melanoma                  1070\n",
      "Head-SCC                        798\n",
      "Stomach-AdenoCA                 667\n",
      "Eso-SCC                         599\n",
      "Thy-AdenoCA                     560\n",
      "AML                             556\n",
      "Ovary-AdenoCA                   549\n",
      "Uterus-AdenoCA                  548\n",
      "DLBC                            512\n",
      "Biliary-AdenoCA                 487\n",
      "Eso-AdenoCA                     486\n",
      "Transitional-cell-carcinoma     389\n",
      "Neuroblastoma                   379\n",
      "Blood-CMDI                      357\n",
      "Sarcoma                         346\n",
      "ALL                             308\n",
      "Cervix                          289\n",
      "Ewings                          275\n",
      "Adrenal-neoplasm                247\n",
      "Sarcoma-bone                    203\n",
      "Testis-CA                       191\n",
      "Pheochromocytoma                182\n",
      "Bladder-TCC                     168\n",
      "Eye                             161\n",
      "Skin-BCC                        129\n",
      "CNS-NOS                         128\n",
      "Oral-SCC                        126\n",
      "Thymoma                         123\n",
      "Mesothelium-Mesothelioma        112\n",
      "Myeloid                          67\n",
      "Bone                             65\n",
      "Meninges-Meningioma              65\n",
      "Prost-Adenoma                    63\n",
      "UCS                              57\n",
      "Skin-SCC                         46\n",
      "Name: tumor_types, dtype: int64\n",
      "\n",
      "\n",
      "Tumor types with smallish counts: 1\n",
      "Skin-SCC    46\n",
      "Name: tumor_types, dtype: int64\n",
      "\n",
      "\n",
      "Unique tumor types:  44\n",
      "['ALL', 'AML', 'Adrenal-neoplasm', 'Biliary-AdenoCA', 'Bladder-TCC', 'Blood-CMDI', 'Bone', 'Breast', 'CNS', 'CNS-NOS', 'Cervix', 'ColoRect-AdenoCA', 'DLBC', 'Eso-AdenoCA', 'Eso-SCC', 'Ewings', 'Eye', 'Head-SCC', 'Kidney', 'Liver', 'Lung', 'Lymph', 'Meninges-Meningioma', 'Mesothelium-Mesothelioma', 'Myeloid', 'Neuroblastoma', 'Oral-SCC', 'Ovary-AdenoCA', 'Panc', 'Pheochromocytoma', 'Prost-AdenoCA', 'Prost-Adenoma', 'Sarcoma', 'Sarcoma-bone', 'Skin-BCC', 'Skin-Melanoma', 'Skin-SCC', 'Stomach-AdenoCA', 'Testis-CA', 'Thy-AdenoCA', 'Thymoma', 'Transitional-cell-carcinoma', 'UCS', 'Uterus-AdenoCA']\n"
     ]
    }
   ],
   "source": [
    "# Cut off limits: a tumor type is kept only if it has more samples than the limit.\n",
    "# small_sample_limit is applied after the tissue grouping, pre_sample_limit before it.\n",
    "small_sample_limit = 35\n",
    "pre_sample_limit = 5\n",
    "\n",
    "# Second argument True: the raw frames are profile catalogs\n",
    "profile_mut_all, prf_unique_tumor_types = prepare_mut_df(profile_raw_data_sets, True, pre_sample_limit, small_sample_limit, grouping=True)\n",
    "\n",
    "# Print some diagnostics from the prepared data set\n",
    "print(\"Profile data:\")\n",
    "print_dset_diag(profile_mut_all, prf_unique_tumor_types, small_sample_limit)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset preprocess for activities data with grouping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 364,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pre cull dim (23829, 66)\n",
      "After pre cull dim (23814, 66)\n",
      "After post cull dim (23678, 66)\n",
      "Activities data:\n",
      "\n",
      "---Data set diagnostics print---\n",
      "\n",
      "Missing entries in mutations: 0\n",
      "The shape of the mutations data frame (23678, 66)\n",
      "Checking normalization: sum of some rows:\n",
      " mut_tri\n",
      "ALL::SJBALL020936_D1                           1.0\n",
      "Breast-cancer::TCGA-C8-A274-01A-11D-A16D-09    1.0\n",
      "Prost-AdenoCA::TCGA-EJ-7317-01A-31D-2114-08    1.0\n",
      "Kidney-RCC::ccRCC-103                          1.0\n",
      "Eso-SCC::420                                   1.0\n",
      "dtype: float64\n",
      "\n",
      "\n",
      "Tumor counts:\n",
      " Breast                         1855\n",
      "Lung                           1668\n",
      "CNS                            1595\n",
      "Liver                          1358\n",
      "Kidney                         1269\n",
      "Lymph                          1192\n",
      "ColoRect-AdenoCA               1185\n",
      "Panc                           1157\n",
      "Prost-AdenoCA                  1091\n",
      "Skin-Melanoma                  1070\n",
      "Head-SCC                        798\n",
      "Stomach-AdenoCA                 667\n",
      "Eso-SCC                         599\n",
      "Thy-AdenoCA                     560\n",
      "AML                             556\n",
      "Ovary-AdenoCA                   549\n",
      "Uterus-AdenoCA                  548\n",
      "DLBC                            512\n",
      "Biliary-AdenoCA                 487\n",
      "Eso-AdenoCA                     486\n",
      "Transitional-cell-carcinoma     389\n",
      "Neuroblastoma                   379\n",
      "Blood-CMDI                      357\n",
      "Sarcoma                         346\n",
      "ALL                             308\n",
      "Cervix                          289\n",
      "Ewings                          275\n",
      "Adrenal-neoplasm                247\n",
      "Sarcoma-bone                    203\n",
      "Testis-CA                       191\n",
      "Pheochromocytoma                182\n",
      "Bladder-TCC                     168\n",
      "Eye                             161\n",
      "Skin-BCC                        129\n",
      "CNS-NOS                         128\n",
      "Oral-SCC                        126\n",
      "Thymoma                         123\n",
      "Mesothelium-Mesothelioma        112\n",
      "Myeloid                          67\n",
      "Bone                             65\n",
      "Meninges-Meningioma              65\n",
      "Prost-Adenoma                    63\n",
      "UCS                              57\n",
      "Skin-SCC                         46\n",
      "Name: tumor_types, dtype: int64\n",
      "\n",
      "\n",
      "Tumor types with smallish counts: 1\n",
      "Skin-SCC    46\n",
      "Name: tumor_types, dtype: int64\n",
      "\n",
      "\n",
      "Unique tumor types:  44\n",
      "['ALL', 'AML', 'Adrenal-neoplasm', 'Biliary-AdenoCA', 'Bladder-TCC', 'Blood-CMDI', 'Bone', 'Breast', 'CNS', 'CNS-NOS', 'Cervix', 'ColoRect-AdenoCA', 'DLBC', 'Eso-AdenoCA', 'Eso-SCC', 'Ewings', 'Eye', 'Head-SCC', 'Kidney', 'Liver', 'Lung', 'Lymph', 'Meninges-Meningioma', 'Mesothelium-Mesothelioma', 'Myeloid', 'Neuroblastoma', 'Oral-SCC', 'Ovary-AdenoCA', 'Panc', 'Pheochromocytoma', 'Prost-AdenoCA', 'Prost-Adenoma', 'Sarcoma', 'Sarcoma-bone', 'Skin-BCC', 'Skin-Melanoma', 'Skin-SCC', 'Stomach-AdenoCA', 'Testis-CA', 'Thy-AdenoCA', 'Thymoma', 'Transitional-cell-carcinoma', 'UCS', 'Uterus-AdenoCA']\n"
     ]
    }
   ],
   "source": [
    "# The four raw activity tables that are combined into one data frame\n",
    "act_raw_data_sets = [PCAWG_wgs_act, TCGA_wes_act, nonPCAWG_wgs_act, other_wes_act]\n",
    "# Second argument False: the raw frames are activity tables; the limits are the ones used for the profile set\n",
    "act_mut_all, act_unique_tumor_types = prepare_mut_df(act_raw_data_sets, False, pre_sample_limit, small_sample_limit, grouping=True)\n",
    "\n",
    "# Print some diagnostics from the prepared data set\n",
    "print(\"Activities data:\")\n",
    "print_dset_diag(act_mut_all, act_unique_tumor_types, small_sample_limit)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check profile data without grouping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 365,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Some content from the full profile set:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>mut_tri</th>\n",
       "      <th>C&gt;A_ACA</th>\n",
       "      <th>C&gt;A_ACC</th>\n",
       "      <th>C&gt;A_ACG</th>\n",
       "      <th>C&gt;A_ACT</th>\n",
       "      <th>C&gt;A_CCA</th>\n",
       "      <th>C&gt;A_CCC</th>\n",
       "      <th>C&gt;A_CCG</th>\n",
       "      <th>C&gt;A_CCT</th>\n",
       "      <th>C&gt;A_GCA</th>\n",
       "      <th>C&gt;A_GCC</th>\n",
       "      <th>...</th>\n",
       "      <th>T&gt;G_CTT</th>\n",
       "      <th>T&gt;G_GTA</th>\n",
       "      <th>T&gt;G_GTC</th>\n",
       "      <th>T&gt;G_GTG</th>\n",
       "      <th>T&gt;G_GTT</th>\n",
       "      <th>T&gt;G_TTA</th>\n",
       "      <th>T&gt;G_TTC</th>\n",
       "      <th>T&gt;G_TTG</th>\n",
       "      <th>T&gt;G_TTT</th>\n",
       "      <th>tumor_types</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>ALL::11</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>ALL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ALL::2211636</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>ALL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ALL::2211638</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>ALL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ALL::2211640</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>ALL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ALL::2211642</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>ALL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 97 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "mut_tri       C>A_ACA  C>A_ACC  C>A_ACG  C>A_ACT  C>A_CCA  C>A_CCC  C>A_CCG  \\\n",
       "ALL::11           0.0      0.0      0.0      0.0      0.0      0.0      0.0   \n",
       "ALL::2211636      0.0      0.0      0.0      0.0      0.0      0.0      0.0   \n",
       "ALL::2211638      0.0      0.0      0.0      0.0      0.0      0.0      0.0   \n",
       "ALL::2211640      0.0      0.0      0.0      0.0      0.0      0.0      0.0   \n",
       "ALL::2211642      0.0      0.0      0.0      0.0      0.0      0.0      0.0   \n",
       "\n",
       "mut_tri        C>A_CCT   C>A_GCA  C>A_GCC  ...   T>G_CTT  T>G_GTA   T>G_GTC  \\\n",
       "ALL::11       0.133333  0.066667      0.0  ...  0.066667      0.0  0.066667   \n",
       "ALL::2211636  0.000000  0.000000      0.0  ...  0.000000      0.0  0.000000   \n",
       "ALL::2211638  0.000000  0.000000      0.0  ...  0.000000      0.0  0.000000   \n",
       "ALL::2211640  0.000000  0.000000      0.0  ...  0.000000      0.0  0.000000   \n",
       "ALL::2211642  0.000000  0.000000      0.0  ...  0.000000      0.0  0.000000   \n",
       "\n",
       "mut_tri       T>G_GTG  T>G_GTT   T>G_TTA  T>G_TTC  T>G_TTG  T>G_TTT  \\\n",
       "ALL::11           0.0      0.0  0.000000      0.0      0.0      0.0   \n",
       "ALL::2211636      0.0      0.0  0.000000      0.0      0.0      0.0   \n",
       "ALL::2211638      0.0      0.0  0.333333      0.0      0.0      0.0   \n",
       "ALL::2211640      0.0      0.0  0.000000      0.0      0.0      0.0   \n",
       "ALL::2211642      0.0      0.0  0.000000      0.0      0.0      0.0   \n",
       "\n",
       "mut_tri       tumor_types  \n",
       "ALL::11               ALL  \n",
       "ALL::2211636          ALL  \n",
       "ALL::2211638          ALL  \n",
       "ALL::2211640          ALL  \n",
       "ALL::2211642          ALL  \n",
       "\n",
       "[5 rows x 97 columns]"
      ]
     },
     "execution_count": 365,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows (the default for head()) of the profile frame.\n",
    "# NOTE(review): the following cell plots profile_all_no_grp; confirm that\n",
    "# profile_mut_all is the frame intended for this 'without grouping' section.\n",
    "print(\"Some content from the full profile set:\")\n",
    "profile_mut_all.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 366,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 1800x360 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Activate the seaborn theme before the figure is created so the whole\n",
    "# figure (not only artists added afterwards) is drawn with it.\n",
    "sns.set_theme()\n",
    "\n",
    "# Use an explicit Axes instead of the implicit pyplot 'current figure' state.\n",
    "fig, ax = plt.subplots(figsize=(25, 5))\n",
    "profile_all_no_grp[\"tumor_types\"].value_counts().sort_index().plot(kind=\"bar\", ax=ax)\n",
    "ax.tick_params(axis=\"x\", labelrotation=90)\n",
    "ax.set(title=\"Tumor types in profile data without grouping\", xlabel=\"Tumor type\", ylabel=\"Count\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check profile data with grouping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 367,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",