From e053100e3756284b2be0a6f60494fcca9e0b477d Mon Sep 17 00:00:00 2001 From: AlexanderKroll <74175710+AlexanderKroll@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:58:33 +0100 Subject: [PATCH] Updating jupyter notebooks --- ...substrate database from GOA database.ipynb | 2664 +++--- ...onal datasets (for re-traing ESM-1b).ipynb | 8076 +++++++++++++++++ ...> 1_2a - Loading Mou et al datasets.ipynb} | 0 ... 1_2b - Loading Yang et al datasets.ipynb} | 0 ... - Training gradient boosting models.ipynb | 1432 ++- ...g additional machine learning models.ipynb | 658 ++ notebooks_and_code/3 Plots and figures.ipynb | 932 +- ... Network (pretraining KM prediction).ipynb | 793 ++ ...4_1 - Training Graph Neural Network.ipynb} | 1159 ++- 9 files changed, 13241 insertions(+), 2473 deletions(-) create mode 100644 notebooks_and_code/1_1 - Creating enzyme-substrate database from GOA database - Additional datasets (for re-traing ESM-1b).ipynb rename notebooks_and_code/{1_2 - Loading Mou et al datasets.ipynb => 1_2a - Loading Mou et al datasets.ipynb} (100%) rename notebooks_and_code/{1_1 - Loading Yang et al datasets.ipynb => 1_2b - Loading Yang et al datasets.ipynb} (100%) create mode 100644 notebooks_and_code/2_4 - Training additional machine learning models.ipynb create mode 100644 notebooks_and_code/4_0 - Training Graph Neural Network (pretraining KM prediction).ipynb rename notebooks_and_code/{4_0 - Training Graph Neural Network.ipynb => 4_1 - Training Graph Neural Network.ipynb} (54%) diff --git a/notebooks_and_code/1_0 - Creating enzyme-substrate database from GOA database.ipynb b/notebooks_and_code/1_0 - Creating enzyme-substrate database from GOA database.ipynb index 32420ae..b9eca08 100644 --- a/notebooks_and_code/1_0 - Creating enzyme-substrate database from GOA database.ipynb +++ b/notebooks_and_code/1_0 - Creating enzyme-substrate database from GOA database.ipynb @@ -12,7 +12,10 @@ "### 4. Splitting the dataset in training and testing set\n", "### 5. Calculating enzyme and substrate representations\n", "### 6. Adding negative data points\n", - "### 7. Adding task-specific enzyme representations" + "### 7. Adding task-specific enzyme representations (extra token)\n", + "### 8. Adding task-specific metabolite representations\n", + "### 9. Adding task-specific enzyme representations (mean representations)\n", + "### 10. Adding ECFP vectors of different dimensions" ] }, { @@ -24,7 +27,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "C:\\Users\\alexk\\projects\\SubFinder\\notebooks_and_code\n" + "C:\\Users\\alexk\\projects\\ESP\\notebooks_and_code\n" ] } ], @@ -69,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -97,7 +100,6 @@ "
6587 rows × 5 columns
\n", + "6587 rows × 4 columns
\n", "" ], "text/plain": [ @@ -341,23 +346,23 @@ "6585 GO:1990888 \"Catalysis of the reaction: 2-polyprenyl-6-hyd... \n", "6586 GO:1990965 \"Catalysis of the reaction: cytosylglucuronic ... \n", "\n", - " Name RHEA ID KEGG ID \n", - "0 trans-hexaprenyltranstransferase activity 20836 NaN \n", - "1 lactase activity 10076 NaN \n", - "2 adenine deaminase activity 23688 NaN \n", - "3 peptidyltransferase activity NaN NaN \n", - "4 succinate dehydrogenase activity 16357 NaN \n", - "... ... ... ... \n", - "6582 clathrin-uncoating ATPase activity NaN NaN \n", - "6583 rRNA cytidine N-acetyltransferase activity NaN NaN \n", - "6584 2-polyprenyl-3-methyl-5-hydroxy-6-methoxy-1,4-... NaN NaN \n", - "6585 2-polyprenyl-6-hydroxyphenol O-methyltransfera... NaN NaN \n", - "6586 cytosylglucuronate decarboxylase activity NaN NaN \n", + " Name RHEA ID \n", + "0 trans-hexaprenyltranstransferase activity 20836 \n", + "1 lactase activity 10076 \n", + "2 adenine deaminase activity 23688 \n", + "3 peptidyltransferase activity NaN \n", + "4 succinate dehydrogenase activity 16357 \n", + "... ... ... \n", + "6582 clathrin-uncoating ATPase activity NaN \n", + "6583 rRNA cytidine N-acetyltransferase activity NaN \n", + "6584 2-polyprenyl-3-methyl-5-hydroxy-6-methoxy-1,4-... NaN \n", + "6585 2-polyprenyl-6-hydroxyphenol O-methyltransfera... NaN \n", + "6586 cytosylglucuronate decarboxylase activity NaN \n", "\n", - "[6587 rows x 5 columns]" + "[6587 rows x 4 columns]" ] }, - "execution_count": 2, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -383,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -422,7 +427,6 @@ "6587 rows × 6 columns
\n", + "6587 rows × 5 columns
\n", "" ], "text/plain": [ @@ -545,18 +538,18 @@ "6585 GO:1990888 \"Catalysis of the reaction: 2-polyprenyl-6-hyd... \n", "6586 GO:1990965 \"Catalysis of the reaction: cytosylglucuronic ... \n", "\n", - " Name RHEA ID KEGG ID \\\n", - "0 trans-hexaprenyltranstransferase activity 20836 NaN \n", - "1 lactase activity 10076 NaN \n", - "2 adenine deaminase activity 23688 NaN \n", - "3 peptidyltransferase activity NaN NaN \n", - "4 succinate dehydrogenase activity 16357 NaN \n", - "... ... ... ... \n", - "6582 clathrin-uncoating ATPase activity NaN NaN \n", - "6583 rRNA cytidine N-acetyltransferase activity NaN NaN \n", - "6584 2-polyprenyl-3-methyl-5-hydroxy-6-methoxy-1,4-... NaN NaN \n", - "6585 2-polyprenyl-6-hydroxyphenol O-methyltransfera... NaN NaN \n", - "6586 cytosylglucuronate decarboxylase activity NaN NaN \n", + " Name RHEA ID \\\n", + "0 trans-hexaprenyltranstransferase activity 20836 \n", + "1 lactase activity 10076 \n", + "2 adenine deaminase activity 23688 \n", + "3 peptidyltransferase activity NaN \n", + "4 succinate dehydrogenase activity 16357 \n", + "... ... ... \n", + "6582 clathrin-uncoating ATPase activity NaN \n", + "6583 rRNA cytidine N-acetyltransferase activity NaN \n", + "6584 2-polyprenyl-3-methyl-5-hydroxy-6-methoxy-1,4-... NaN \n", + "6585 2-polyprenyl-6-hydroxyphenol O-methyltransfera... NaN \n", + "6586 cytosylglucuronate decarboxylase activity NaN \n", "\n", " substrates \n", "0 [all-trans-hexaprenyl diphosphate, isopentenyl... \n", @@ -571,10 +564,10 @@ "6585 [2-polyprenyl-6-hydroxyphenol, S-adenosyl-L-me... \n", "6586 [cytosylglucuronic acid, H(+) ] \n", "\n", - "[6587 rows x 6 columns]" + "[6587 rows x 5 columns]" ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -605,7 +598,116 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m54200 rows × 3 columns
\n", + "2407 rows × 1 columns
\n", "" ], "text/plain": [ - " RHEA ID CHEBI IDs CHEBI_ID_list\n", - "0 10000.0 NaN [CHEBI:15377, CHEBI:16459]\n", - "1 10001.0 NaN [CHEBI:15377, CHEBI:16459]\n", - "2 10002.0 NaN [CHEBI:28938, CHEBI:31011]\n", - "3 10003.0 NaN [CHEBI:15377, CHEBI:16459]\n", - "4 10004.0 NaN [CHEBI:17484]\n", - "... ... ... ...\n", - "54195 66623.0 NaN [CHEBI:71550, CHEBI:15379, CHEBI:57618]\n", - "54196 66624.0 NaN [CHEBI:16175, CHEBI:15379, CHEBI:57618]\n", - "54197 66625.0 NaN [CHEBI:16175, CHEBI:15379, CHEBI:57618]\n", - "54198 66626.0 NaN [CHEBI:71541, CHEBI:15378, CHEBI:15377, CHEBI:...\n", - "54199 66627.0 NaN [CHEBI:16175, CHEBI:15379, CHEBI:57618]\n", + " metabolites\n", + "0 xanthan \n", + "1 indan-1-ol\n", + "2 oxidosqualene \n", + "3 galactogen\n", + "4 (histone)-arginine \n", + "... ...\n", + "2402 GDP-mannose = mannan(n+1)\n", + "2403 a 3-oxo-dodecanoyl-[acp] \n", + "2404 coenzyme A or its derivatives\n", + "2405 DNA with alkylated base\n", + "2406 2,4-dihydroxy-hept-trans-2-ene-1,7-dioate \n", "\n", - "[54200 rows x 3 columns]" + "[2407 rows x 1 columns]" ] }, - "execution_count": 5, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_RHEA = pd.DataFrame(columns = [\"RHEA ID\", \"CHEBI IDs\", \"CHEBI_ID_list\"])\n", - "\n", - "file1 = open(join(CURRENT_DIR, \"..\" ,\"data\", \"reaction_data\", \"rhea-reactions.txt\"), 'r')\n", - "Lines = file1.readlines()\n", - "\n", - "while True:\n", - " try:\n", - " end = Lines.index('///\\n')\n", - " entry = Lines[:end]\n", - " RHEA_ID, CHEBI_IDs = extract_RHEA_ID_and_CHEBI_IDs(entry)\n", - " CHEBI_ID_list = get_substrate_IDs(IDs = CHEBI_IDs)\n", - " Lines = Lines[end+1:]\n", - " df_RHEA = df_RHEA.append({\"RHEA ID\" : RHEA_ID, \"CHEBI_ID_list\" : CHEBI_ID_list}, ignore_index = True)\n", - " except ValueError:\n", - " break\n", - " \n", - "df_RHEA[\"RHEA ID\"] = [float(ID.split(\":\")[-1]) for ID in df_RHEA[\"RHEA ID\"]]\n", - "df_RHEA.to_pickle(join(CURRENT_DIR, \"..\" ,\"data\", \"reaction_data\", \"RHEA_reaction_df.pkl\"))\n", - "df_RHEA" + "metabolites = []\n", + "for ind in df_substrates.index:\n", + " if pd.isnull(df_substrates[\"molecule ID\"][ind]):\n", + " metabolites = metabolites + [df_substrates[\"molecule\"][ind]]\n", + " \n", + "df_unmapped = pd.DataFrame(data = {\"metabolites\" : list(set(metabolites))})\n", + "df_unmapped" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### (b)(ii) Mapping CHEBI IDs to df_catalytic" + "#### (c)(i) Mapping metabolite names to KEGG compound synonym database:" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " | GO ID | \n", - "Definition | \n", - "Name | \n", - "RHEA ID | \n", - "KEGG ID | \n", - "substrates | \n", - "CHEBI IDs | \n", - "CHEBI_ID_list | \n", - "
---|---|---|---|---|---|---|---|---|
0 | \n", - "GO:0000010 | \n", - "\"Catalysis of the reaction: all-trans-hexapren... | \n", - "trans-hexaprenyltranstransferase activity | \n", - "20836.0 | \n", - "NaN | \n", - "[all-trans-hexaprenyl diphosphate, isopentenyl... | \n", - "NaN | \n", - "[CHEBI:58179, CHEBI:128769] | \n", - "
1 | \n", - "GO:0000016 | \n", - "\"Catalysis of the reaction: lactose + H2O = D-... | \n", - "lactase activity | \n", - "10076.0 | \n", - "NaN | \n", - "[lactose, H2O ] | \n", - "NaN | \n", - "[CHEBI:15377, CHEBI:17716] | \n", - "
2 | \n", - "GO:0000034 | \n", - "\"Catalysis of the reaction: adenine + H2O = hy... | \n", - "adenine deaminase activity | \n", - "23688.0 | \n", - "NaN | \n", - "[adenine, H2O ] | \n", - "NaN | \n", - "[CHEBI:16708, CHEBI:15378, CHEBI:15377] | \n", - "
3 | \n", - "GO:0000048 | \n", - "\"Catalysis of the reaction: peptidyl-tRNA(1) +... | \n", - "peptidyltransferase activity | \n", - "NaN | \n", - "NaN | \n", - "[peptidyl-tRNA(1), aminoacyl-tRNA(2) ] | \n", - "NaN | \n", - "NaN | \n", - "
4 | \n", - "GO:0000104 | \n", - "\"Catalysis of the reaction: succinate + accept... | \n", - "succinate dehydrogenase activity | \n", - "16357.0 | \n", - "NaN | \n", - "[succinate, acceptor ] | \n", - "NaN | \n", - "[CHEBI:13193, CHEBI:30031] | \n", - "
\n", - " | GO ID | \n", - "molecule | \n", - "molecule ID | \n", - "
---|---|---|---|
0 | \n", - "GO:0000010 | \n", - "NaN | \n", - "CHEBI:58179 | \n", - "
1 | \n", - "GO:0000010 | \n", - "NaN | \n", - "CHEBI:128769 | \n", - "
2 | \n", - "GO:0000016 | \n", - "NaN | \n", - "CHEBI:15377 | \n", - "
3 | \n", - "GO:0000016 | \n", - "NaN | \n", - "CHEBI:17716 | \n", - "
4 | \n", - "GO:0000034 | \n", - "NaN | \n", - "CHEBI:16708 | \n", - "
... | \n", - "... | \n", - "... | \n", - "... | \n", - "
14649 | \n", - "GO:1990887 | \n", - "S-adenosyl-L-methionine | \n", - "NaN | \n", - "
14650 | \n", - "GO:1990888 | \n", - "2-polyprenyl-6-hydroxyphenol | \n", - "NaN | \n", - "
14651 | \n", - "GO:1990888 | \n", - "S-adenosyl-L-methionine | \n", - "NaN | \n", - "
14652 | \n", - "GO:1990965 | \n", - "cytosylglucuronic acid | \n", - "NaN | \n", - "
14653 | \n", - "GO:1990965 | \n", - "H(+) | \n", - "NaN | \n", - "
14654 rows × 3 columns
\n", - "\n", - " | metabolites | \n", - "
---|---|
0 | \n", - "xanthan | \n", - "
1 | \n", - "indan-1-ol | \n", - "
2 | \n", - "oxidosqualene | \n", - "
3 | \n", - "galactogen | \n", - "
4 | \n", - "(histone)-arginine | \n", - "
... | \n", - "... | \n", - "
2402 | \n", - "GDP-mannose = mannan(n+1) | \n", - "
2403 | \n", - "a 3-oxo-dodecanoyl-[acp] | \n", - "
2404 | \n", - "coenzyme A or its derivatives | \n", - "
2405 | \n", - "DNA with alkylated base | \n", - "
2406 | \n", - "2,4-dihydroxy-hept-trans-2-ene-1,7-dioate | \n", - "
2407 rows × 1 columns
\n", - "462406 rows × 3 columns
\n", + "476866 rows × 4 columns
\n", "" ], "text/plain": [ - " Uniprot ID molecule ID evidence\n", - "0 A0A009IHW8 CHEBI:15377 exp\n", - "1 A0A009IHW8 CHEBI:57540 exp\n", - "2 A0A022PMU5 C00030 phylo\n", - "3 A0A022PN36 CHEBI:57318 phylo\n", - "4 A0A022PN36 CHEBI:57540 phylo\n", - "... ... ... ...\n", - "495070 Z4YNJ9 CHEBI:15378 phylo\n", - "495071 Z4YNJ9 CHEBI:15379 phylo\n", - "495072 Z4YNJ9 CHEBI:57394 phylo\n", - "495073 Z4YNJ9 C00154 phylo\n", - "495074 Z4YNJ9 C00030 phylo\n", + " Uniprot ID molecule ID evidence RHEA ID\n", + "0 A0A009IHW8 CHEBI:15377 exp 16301\n", + "1 A0A009IHW8 CHEBI:57540 exp 16301\n", + "2 A0A022PMU5 C00030 phylo NaN\n", + "3 A0A022PN36 CHEBI:57318 phylo 22432\n", + "4 A0A022PN36 CHEBI:57540 phylo 22432\n", + "... ... ... ... ...\n", + "495070 Z4YNJ9 CHEBI:15378 phylo 19721\n", + "495071 Z4YNJ9 CHEBI:15379 phylo 19721\n", + "495072 Z4YNJ9 CHEBI:57394 phylo 19721\n", + "495073 Z4YNJ9 C00154 phylo NaN\n", + "495074 Z4YNJ9 C00030 phylo NaN\n", "\n", - "[462406 rows x 3 columns]" + "[476866 rows x 4 columns]" ] }, - "execution_count": 17, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_UID_MID = pd.DataFrame(columns =[\"Uniprot ID\", \"molecule ID\", \"evidence\"])\n", + "df_UID_MID = pd.DataFrame(columns =[\"Uniprot ID\", \"molecule ID\", \"evidence\", \"RHEA ID\"])\n", "\n", "for ind in df_GO_UID.index:\n", " if ind >= -1:\n", " GO_ID = df_GO_UID[\"GO Term\"][ind]\n", + " try:\n", + " RHEA_ID = list(df_catalytic[\"RHEA ID\"].loc[df_catalytic[\"GO ID\"] == GO_ID])[0]\n", + " except:\n", + " RHEA_ID = np.nan\n", + " print(GO_ID)\n", " UID = df_GO_UID[\"Uniprot ID\"][ind]\n", " evidence = df_GO_UID[\"evidence\"][ind]\n", " met_IDs = list(df_substrates[\"molecule ID\"].loc[df_substrates[\"GO ID\"] == GO_ID])\n", " for met_ID in met_IDs:\n", " df_UID_MID = df_UID_MID.append({\"Uniprot ID\" : UID, \"molecule ID\" : met_ID,\n", - " \"evidence\": evidence}, ignore_index = True)\n", + " \"evidence\": evidence, \"RHEA ID\" : RHEA_ID}, ignore_index = True)\n", " if ind % 1000 ==1:\n", " print(ind)\n", " \n", @@ -2463,7 +2150,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -2490,6 +2177,7 @@ "28112 rows × 3 columns
\n", + "29603 rows × 4 columns
\n", "" ], "text/plain": [ - " Uniprot ID molecule ID evidence\n", - "0 A0A009IHW8 CHEBI:15377 exp\n", - "1 A0A009IHW8 CHEBI:57540 exp\n", - "10167 A0A059TC02 CHEBI:16731 exp\n", - "10168 A0A059TC02 CHEBI:57287 exp\n", - "10169 A0A059TC02 CHEBI:58349 exp\n", - "... ... ... ...\n", - "495053 X5KCU3 C00535 exp\n", - "495054 X5KCU9 C00535 exp\n", - "495055 X5KJC0 C00535 exp\n", - "495056 X5L1L5 C00535 exp\n", - "495057 X5L565 C00535 exp\n", + " Uniprot ID molecule ID evidence RHEA ID\n", + "0 A0A009IHW8 CHEBI:15377 exp 16301\n", + "1 A0A009IHW8 CHEBI:57540 exp 16301\n", + "10167 A0A059TC02 CHEBI:16731 exp 10620\n", + "10168 A0A059TC02 CHEBI:57287 exp 10620\n", + "10169 A0A059TC02 CHEBI:58349 exp 10620\n", + "... ... ... ... ...\n", + "495053 X5KCU3 C00535 exp NaN\n", + "495054 X5KCU9 C00535 exp NaN\n", + "495055 X5KJC0 C00535 exp NaN\n", + "495056 X5L1L5 C00535 exp NaN\n", + "495057 X5L565 C00535 exp NaN\n", "\n", - "[28112 rows x 3 columns]" + "[29603 rows x 4 columns]" ] }, - "execution_count": 18, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -2590,6 +2289,26 @@ "df_UID_MID.loc[df_UID_MID[\"evidence\"] == \"exp\"]" ] }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(476866, 29603)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df_UID_MID), len(df_UID_MID.loc[df_UID_MID[\"evidence\"] == \"exp\"])" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2599,7 +2318,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -2626,6 +2345,7 @@ "369461 rows × 3 columns
\n", + "378482 rows × 4 columns
\n", "" ], "text/plain": [ - " Uniprot ID molecule ID evidence\n", - "1 A0A009IHW8 CHEBI:57540 exp\n", - "2 A0A022PMU5 C00030 phylo\n", - "3 A0A022PN36 CHEBI:57318 phylo\n", - "4 A0A022PN36 CHEBI:57540 phylo\n", - "6 A0A022PN36 CHEBI:16469 phylo\n", - "... ... ... ...\n", - "495066 Z4YHZ5 CHEBI:16810 phylo\n", - "495067 Z4YHZ5 CHEBI:50342 phylo\n", - "495072 Z4YNJ9 CHEBI:57394 phylo\n", - "495073 Z4YNJ9 C00154 phylo\n", - "495074 Z4YNJ9 C00030 phylo\n", + " Uniprot ID molecule ID evidence RHEA ID\n", + "1 A0A009IHW8 CHEBI:57540 exp 16301\n", + "2 A0A022PMU5 C00030 phylo NaN\n", + "3 A0A022PN36 CHEBI:57318 phylo 22432\n", + "4 A0A022PN36 CHEBI:57540 phylo 22432\n", + "5 A0A022PN36 CHEBI:57318 phylo 16105\n", + "... ... ... ... ...\n", + "495066 Z4YHZ5 CHEBI:16810 phylo 18945\n", + "495067 Z4YHZ5 CHEBI:50342 phylo 18945\n", + "495072 Z4YNJ9 CHEBI:57394 phylo 19721\n", + "495073 Z4YNJ9 C00154 phylo NaN\n", + "495074 Z4YNJ9 C00030 phylo NaN\n", "\n", - "[369461 rows x 3 columns]" + "[378482 rows x 4 columns]" ] }, - "execution_count": 19, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -2736,6 +2467,134 @@ "df_UID_MID" ] }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(378482, 23384)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df_UID_MID), len(df_UID_MID.loc[df_UID_MID[\"evidence\"] == \"exp\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6219" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "29603-23384" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19270" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df_UID_MID.loc[df_UID_MID[\"evidence\"] == \"exp\"].loc[~pd.isnull(df_UID_MID[\"RHEA ID\"])])" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "df_UID_MID_train = pd.read_pickle(join(CURRENT_DIR, \"..\" ,\"data\",\"enzyme_substrate_data\", \"df_UID_MID_train.pkl\" ))\n", + "df_UID_MID_test = pd.read_pickle(join(CURRENT_DIR, \"..\" ,\"data\", \"enzyme_substrate_data\", \"df_UID_MID_test.pkl\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "15051" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_UID_MID2 = df_UID_MID.loc[df_UID_MID[\"evidence\"] == \"exp\"]\n", + "df = df_UID_MID_train.loc[df_UID_MID_train[\"evidence\"] == \"exp\"]\n", + "df.drop(columns = \"evidence\", inplace = True)\n", + "df_train = df.merge(df_UID_MID2, on = [\"Uniprot ID\", \"molecule ID\"], how = \"left\")\n", + "\n", + "df = df_UID_MID_test.loc[df_UID_MID_test[\"evidence\"] == \"exp\"]\n", + "df.drop(columns = \"evidence\", inplace = True)\n", + "df_test = df.merge(df_UID_MID2, on = [\"Uniprot ID\", \"molecule ID\"], how = \"left\")\n", + "\n", + "len(df_train.loc[~pd.isnull(df_train[\"RHEA ID\"])]) +len(df_test.loc[~pd.isnull(df_test[\"RHEA ID\"])])" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200634" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_UID_MID2 = df_UID_MID.loc[df_UID_MID[\"evidence\"] == \"phylo\"]\n", + "df = df_UID_MID_train.loc[df_UID_MID_train[\"evidence\"] == \"phylo\"]\n", + "df.drop(columns = \"evidence\", inplace = True)\n", + "df_train = df.merge(df_UID_MID2, on = [\"Uniprot ID\", \"molecule ID\"], how = \"left\")\n", + "\n", + "df = df_UID_MID_test.loc[df_UID_MID_test[\"evidence\"] == \"phylo\"]\n", + "df.drop(columns = \"evidence\", inplace = True)\n", + "df_test = df.merge(df_UID_MID2, on = [\"Uniprot ID\", \"molecule ID\"], how = \"left\")\n", + "\n", + "len(df_train.loc[~pd.isnull(df_train[\"RHEA ID\"])]) +len(df_test.loc[~pd.isnull(df_test[\"RHEA ID\"])])" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2897,7 +2756,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -2950,232 +2809,59 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "# cluster the fasta files\n", - "cluster_folder = join(CURRENT_DIR, \"..\" ,\"data\", \"enzyme_data\", 'clusters')\n", - "start_folder = cluster_folder\n", - "cluster_all_levels(start_folder, \n", - " cluster_folder, \n", - " filename='all_sequences')" - ] - }, - { - "cell_type": "code", - "execution_count": 29, + "execution_count": 53, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "\n", - " | cluster | \n", - "
---|---|
count | \n", - "230802.000000 | \n", - "
mean | \n", - "61688.071425 | \n", - "
std | \n", - "37173.511476 | \n", - "
min | \n", - "0.000000 | \n", - "
25% | \n", - "29925.250000 | \n", - "
50% | \n", - "59270.000000 | \n", - "
75% | \n", - "92366.000000 | \n", - "
max | \n", - "133172.000000 | \n", - "
\n", - " | cluster | \n", - "member | \n", - "
---|---|---|
0 | \n", - "0 | \n", - "59618 | \n", - "
1 | \n", - "1 | \n", - "17477 | \n", - "
2 | \n", - "2 | \n", - "193350 | \n", - "
3 | \n", - "3 | \n", - "202039 | \n", - "
4 | \n", - "3 | \n", - "31895 | \n", - "
\n", - " | cluster | \n", - "member | \n", - "
---|---|---|
230797 | \n", - "133168 | \n", - "102904 | \n", - "
230798 | \n", - "133169 | \n", - "168585 | \n", - "
230799 | \n", - "133170 | \n", - "39204 | \n", - "
230800 | \n", - "133171 | \n", - "42158 | \n", - "
230801 | \n", - "133172 | \n", - "99620 | \n", - "
55418 rows × 6 columns
\n", + "57651 rows × 7 columns
\n", "" ], "text/plain": [ - " Uniprot ID molecule ID evidence \\\n", - "0 Q5B2F7 CHEBI:57344 exp \n", - "1 Q9SAH9 CHEBI:58349 exp \n", - "2 Q8IPJ6 CHEBI:57776 exp \n", - "3 A0A1D5PCZ1 C00002 exp \n", - "4 O22765 CHEBI:33384 exp \n", - "... ... ... ... \n", - "55413 P43123 CHEBI:30616 NaN \n", - "55414 Q8RVK9 C00002 NaN \n", - "55415 Q8RVK9 CHEBI:30616 NaN \n", - "55416 Q62730 CHEBI:30616 NaN \n", - "55417 Q62730 C00002 NaN \n", + " Uniprot ID molecule ID evidence RHEA ID \\\n", + "0 P51567 C00002 exp NaN \n", + "1 P08539 CHEBI:37565 exp 19669 \n", + "2 Q84WW2 CHEBI:57955 exp 12556 \n", + "3 P33279 C00002 exp NaN \n", + "4 Q8JGL9 CHEBI:456216 exp 18159 \n", + "... ... ... ... ... \n", + "57646 P09884 C00002 NaN NaN \n", + "57647 P09884 CHEBI:30616 NaN NaN \n", + "57648 P39177 CHEBI:29919 NaN NaN \n", + "57649 P39177 C00001 NaN NaN \n", + "57650 P39177 CHEBI:16240 NaN NaN \n", "\n", " ECFP Binding type \n", - "0 0100000001000000000000000000000001000000000000... 1 NaN \n", - "1 0000000001000000100000100000000000000000000000... 1 NaN \n", - "2 0000000000000000000000000000010001000000000000... 1 NaN \n", + "0 0000000001000000000000000000000000000000000000... 1 NaN \n", + "1 0000001000000000000000000000000000000000000000... 1 NaN \n", + "2 0000000000000100000000000000000000000000000000... 1 NaN \n", "3 0000000001000000000000000000000000000000000000... 1 NaN \n", - "4 0100000000000000000000000000000000000000000000... 1 NaN \n", + "4 0000000001000000000000000000000000000000000000... 1 NaN \n", "... ... ... ... \n", - "55413 NaN 0 NaN \n", - "55414 NaN 0 NaN \n", - "55415 NaN 0 NaN \n", - "55416 NaN 0 NaN \n", - "55417 NaN 0 NaN \n", + "57646 NaN 0 NaN \n", + "57647 NaN 0 NaN \n", + "57648 NaN 0 NaN \n", + "57649 NaN 0 NaN \n", + "57650 NaN 0 NaN \n", "\n", - "[55418 rows x 6 columns]" + "[57651 rows x 7 columns]" ] }, - "execution_count": 95, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -5144,6 +4883,26 @@ "df_UID_MID_train_exp" ] }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.18952277759457295" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(count)/len(df_UID_MID_train_exp.loc[df_UID_MID_train_exp[\"Binding\"] == 0])" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -12274,27 +12033,241 @@ ] }, { - "cell_type": "markdown", + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Adding task-specific enzyme representations (extra token)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (a) Creating input data for training the task-specific ESM1b vectors:" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "df_UID_MID_train = pd.read_pickle(join(CURRENT_DIR, \"..\" ,\"data\",\"enzyme_substrate_data\", \"df_UID_MID_train.pkl\" ))\n", + "df_UID_MID_test = pd.read_pickle(join(CURRENT_DIR, \"..\" ,\"data\", \"enzyme_substrate_data\", \"df_UID_MID_test.pkl\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | Uniprot ID | \n", + "molecule ID | \n", + "evidence | \n", + "ECFP | \n", + "
---|---|---|---|---|
18 | \n", + "Q921A4 | \n", + "C00030 | \n", + "exp | \n", + "0000000000000000010000000000000000000000000000... | \n", + "
22 | \n", + "Q5B2F7 | \n", + "CHEBI:57344 | \n", + "exp | \n", + "0100000001000000000000000000000001000000000000... | \n", + "
59 | \n", + "Q9SAH9 | \n", + "CHEBI:58349 | \n", + "exp | \n", + "0000000001000000100000100000000000000000000000... | \n", + "
135 | \n", + "Q8IPJ6 | \n", + "CHEBI:57776 | \n", + "exp | \n", + "0000000000000000000000000000010001000000000000... | \n", + "
160 | \n", + "A0A1D5PCZ1 | \n", + "C00002 | \n", + "exp | \n", + "0000000001000000000000000000000000000000000000... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
289967 | \n", + "P04152 | \n", + "C00677 | \n", + "exp | \n", + "0000000000000000001000000000000000000000000000... | \n", + "
289976 | \n", + "Q4Q1I5 | \n", + "CHEBI:16810 | \n", + "exp | \n", + "0000000000000000000000000000000010000000000000... | \n", + "
289995 | \n", + "P43123 | \n", + "CHEBI:46398 | \n", + "exp | \n", + "0000000000000000000000000000000000000000000000... | \n", + "
290031 | \n", + "Q8RVK9 | \n", + "CHEBI:57384 | \n", + "exp | \n", + "0100000001000000000000000000000001000000000000... | \n", + "
290040 | \n", + "Q62730 | \n", + "CHEBI:57540 | \n", + "exp | \n", + "0000000001000000000000100000000000000000000000... | \n", + "
14778 rows × 4 columns
\n", + "\n", + " | Uniprot ID | \n", + "Sequence | \n", + "ESM1b_ts_mean | \n", + "
---|---|---|---|
0 | \n", + "G3QP07 | \n", + "MAAGAGTAGLASGPGVVRDPAASQPRKRPGREGGEGARRSDTMAGG... | \n", + "[-0.72636825, 0.52454716, -0.5797731, -0.24976... | \n", + "
1 | \n", + "Q9KNU3 | \n", + "MRNTMFTSKEGQTIPQVTFPTRQGDAWVNVTSDELFKGKTVIVFSL... | \n", + "[-0.25921354, 0.2036142, -0.23848991, -0.14593... | \n", + "
2 | \n", + "H3HBD2 | \n", + "MYPSVALLLWLWGAGVALQVIGLLGCLVEQNDTSDCASTARDRRTS... | \n", + "[-0.8491672, 0.34376088, -1.1393323, 0.3514062... | \n", + "
3 | \n", + "A0A3B6C0N2 | \n", + "MEYHRVVSLVAVVVVLLRRWPALSSAQAPVSRTITVDSHGGGDFSS... | \n", + "[-0.27494934, 0.5831903, -0.47037718, -0.17670... | \n", + "
4 | \n", + "A0A1Z5RRB7 | \n", + "MDMPSHTHSQLCESKALVASYTQEARKRNQQHNMASKPGPLSRWPW... | \n", + "[0.13033693, -0.873903, -0.16711213, -0.122451... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
230809 | \n", + "A0A072VJI1 | \n", + "MELSAVTLGGKGSSLSSSAVYATAIGKSQIKIDSSALDRLTSPPSS... | \n", + "[0.46406618, 0.14648609, -0.34433103, 0.074277... | \n", + "
230810 | \n", + "D7SVZ9 | \n", + "MFIESFKVESPNVKYTENEIHSVYDYETTELVHENRNGTYQWVVKP... | \n", + "[0.06923403, -0.5674761, -0.27717096, 0.625875... | \n", + "
230811 | \n", + "A0A2C9WL07 | \n", + "MAPISILLFSSILLFSASSTGRALSFNYYEKTCPDVELIVTNAVKN... | \n", + "[-0.64768815, 0.81601745, 0.12785931, -0.55637... | \n", + "
230812 | \n", + "F4ICB6 | \n", + "MGCVNSKQTVSVTPAIDHSGVFRDNVCSGSGRIVVEDLPPVTETKL... | \n", + "[0.054608203, 1.0352598, 0.7369152, -1.6918645... | \n", + "
230813 | \n", + "B5LAU7 | \n", + "MAAAATCAFFPTGNPPSDSGAKSSGNLGGGSVPGSIDARGLNNVKK... | \n", + "[0.2115751, 0.41601294, -0.66078496, 0.3504067... | \n", + "
230814 rows × 3 columns
\n", + "\n", + " | Uniprot ID | \n", + "molecule ID | \n", + "evidence | \n", + "ECFP | \n", + "Binding | \n", + "type | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "Q5B2F7 | \n", + "CHEBI:57344 | \n", + "exp | \n", + "0100000001000000000000000000000001000000000000... | \n", + "1 | \n", + "NaN | \n", + "
1 | \n", + "Q9SAH9 | \n", + "CHEBI:58349 | \n", + "exp | \n", + "0000000001000000100000100000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
2 | \n", + "Q8IPJ6 | \n", + "CHEBI:57776 | \n", + "exp | \n", + "0000000000000000000000000000010001000000000000... | \n", + "1 | \n", + "NaN | \n", + "
3 | \n", + "A0A1D5PCZ1 | \n", + "C00002 | \n", + "exp | \n", + "0000000001000000000000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
4 | \n", + "O22765 | \n", + "CHEBI:33384 | \n", + "exp | \n", + "0100000000000000000000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
29207 | \n", + "P04152 | \n", + "CHEBI:15901 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
29208 | \n", + "Q4Q1I5 | \n", + "C00007 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
29209 | \n", + "P43123 | \n", + "CHEBI:30616 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
29210 | \n", + "Q8RVK9 | \n", + "C00002 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
29211 | \n", + "Q62730 | \n", + "CHEBI:30616 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
29212 rows × 6 columns
\n", + "\n", + " | Uniprot ID | \n", + "molecule ID | \n", + "evidence | \n", + "ECFP | \n", + "Binding | \n", + "type | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "A8XT89 | \n", + "CHEBI:58885 | \n", + "phylo | \n", + "0000000000000100000000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
1 | \n", + "B2GV06 | \n", + "CHEBI:57292 | \n", + "phylo | \n", + "0100100001000000000000000000000011000000000000... | \n", + "1 | \n", + "NaN | \n", + "
2 | \n", + "A0A022RBJ3 | \n", + "CHEBI:33227 | \n", + "phylo | \n", + "1000000000000000000000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
3 | \n", + "G3S168 | \n", + "CHEBI:59776 | \n", + "phylo | \n", + "0100000000000000000000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
4 | \n", + "F6I0H0 | \n", + "C00002 | \n", + "phylo | \n", + "0000000001000000000000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
424181 | \n", + "A0A0A0LAF9 | \n", + "C00002 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
424182 | \n", + "A9V8I9 | \n", + "C00002 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
424183 | \n", + "A0A2J6JMI2 | \n", + "C00002 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
424184 | \n", + "Q8Y7G6 | \n", + "C00002 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
424185 | \n", + "B9R6X0 | \n", + "C00002 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
424186 rows × 6 columns
\n", + "\n", + " | Uniprot ID | \n", + "molecule ID | \n", + "evidence | \n", + "ECFP | \n", + "Binding | \n", + "type | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "P71828 | \n", + "CHEBI:57925 | \n", + "exp | \n", + "0100000001000000000000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
1 | \n", + "A0A1D8PGI8 | \n", + "CHEBI:16897 | \n", + "exp | \n", + "0100000000000001000000000000000000000000000100... | \n", + "1 | \n", + "NaN | \n", + "
2 | \n", + "Q8NEZ4 | \n", + "C00019 | \n", + "exp | \n", + "0100100001000000000000000000000001000000000000... | \n", + "1 | \n", + "NaN | \n", + "
3 | \n", + "F4K5T2 | \n", + "CHEBI:35235 | \n", + "exp | \n", + "0100000000000000000000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
4 | \n", + "Q05762 | \n", + "CHEBI:57453 | \n", + "exp | \n", + "0110000000000000001000000000000000000000000000... | \n", + "1 | \n", + "NaN | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
7009 | \n", + "P53739 | \n", + "CHEBI:57618 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
7010 | \n", + "H9D1R1 | \n", + "CHEBI:71682 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
7011 | \n", + "P00962 | \n", + "CHEBI:58048 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
7012 | \n", + "P48163 | \n", + "C00002 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
7013 | \n", + "Q3TIG7 | \n", + "CHEBI:57618 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "NaN | \n", + "
7014 rows × 6 columns
\n", + "\n", + " | molecule ID | \n", + "ESM1b | \n", + "ECFP | \n", + "RDKit FP | \n", + "MACCS FP | \n", + "PMID | \n", + "MW | \n", + "LogP | \n", + "log10_KM | \n", + "checked | \n", + "GNN FP | \n", + "Uniprot ID | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "C00387 | \n", + "[0.100948475, 0.23829113, 0.0027401948, 0.0371... | \n", + "0000000000000000000000000000000000000000010000... | \n", + "1010111010101011101111011100011011000100100110... | \n", + "0000000000000000000000000100000000000010000100... | \n", + "17918964.0 | \n", + "283.091669 | \n", + "-2.6867 | \n", + "-0.728158 | \n", + "True | \n", + "[13.362184, 70.41528, 2.7784162, 60.74657, 0.0... | \n", + "Enzyme:train:0 | \n", + "
1 | \n", + "C00143 | \n", + "[-0.09477718, 0.16472308, 0.09403025, 0.007433... | \n", + "0100000000000000000000000000000000000000000000... | \n", + "1111111000110111101101111011100011001011011110... | \n", + "0000000000000000000000000100100000000010000100... | \n", + "21858212.0 | \n", + "457.170981 | \n", + "-0.5219 | \n", + "-0.744727 | \n", + "True | \n", + "[18.767265, 151.67131, 25.194078, 66.9493, 0.6... | \n", + "Enzyme:train:1 | \n", + "
2 | \n", + "C00756 | \n", + "[0.12043195, 0.17901447, -0.003300894, 0.07185... | \n", + "0000000000000000000000000000000001000000000000... | \n", + "0000000000000000000000000000000000000000000000... | \n", + "0000000000000000000000000000000000000000000000... | \n", + "19383697.0 | \n", + "130.135765 | \n", + "2.3392 | \n", + "0.588832 | \n", + "True | \n", + "[0.053105697, 23.302288, 3.7088723, 5.9439626,... | \n", + "Enzyme:train:2 | \n", + "
3 | \n", + "C00002 | \n", + "[0.068544716, 0.23684321, 0.080181114, -0.0251... | \n", + "0000000001000000000000000000000000000000000000... | \n", + "1010111010101011101011111000111010011100100111... | \n", + "0000000000000000000000000000010000000010000100... | \n", + "19509290.0 | \n", + "506.995745 | \n", + "-1.6290 | \n", + "-0.709965 | \n", + "True | \n", + "[15.331518, 103.84776, 6.569991, 63.609444, 0.... | \n", + "Enzyme:train:3 | \n", + "
4 | \n", + "C00083 | \n", + "[-0.062576994, 0.30821875, 0.101220384, -0.011... | \n", + "0100000001000100000000000000000001000000010000... | \n", + "1010111010101011101011111011111010011100111111... | \n", + "0000000000000000000000000000010000000010000100... | \n", + "17292360.0 | \n", + "853.115603 | \n", + "-1.8606 | \n", + "-2.246545 | \n", + "True | \n", + "[19.037132, 187.85568, 16.434797, 90.37692, 1.... | \n", + "Enzyme:train:4 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
7575 | \n", + "C20925 | \n", + "[-0.12106511, 0.16286044, -0.05657043, 0.00162... | \n", + "0100000000000010000000000000000001000000010000... | \n", + "0000000000000011100011000011000011000000001000... | \n", + "0000000000000000000000000000000000000000000000... | \n", + "NaN | \n", + "390.175064 | \n", + "-2.1652 | \n", + "-1.000000 | \n", + "False | \n", + "[14.411318, 104.04242, 21.408749, 26.555807, 2... | \n", + "Enzyme:train:7575 | \n", + "
7576 | \n", + "C21181 | \n", + "[-0.009757707, 0.1251226, 0.011750575, -0.0227... | \n", + "0100000000000000000000000000000000000000000000... | \n", + "0000000010000000000000000000000000000000000000... | \n", + "0000000000000000000000000000000000000001100000... | \n", + "NaN | \n", + "153.993594 | \n", + "-1.5660 | \n", + "-0.853872 | \n", + "False | \n", + "\n", + " | Enzyme:train:7576 | \n", + "
7577 | \n", + "C21310 | \n", + "[-0.0037425177, 0.06174834, -0.05052497, 0.063... | \n", + "0000000000000100000000000000000000001000000000... | \n", + "1011111011011111101111111101101011111111111111... | \n", + "0000000000000000000000000100010000000010000100... | \n", + "NaN | \n", + "522.990660 | \n", + "-2.9161 | \n", + "-3.102373 | \n", + "False | \n", + "[12.673787, 120.78082, 6.8376884, 88.71222, 0.... | \n", + "Enzyme:train:7577 | \n", + "
7578 | \n", + "C21563 | \n", + "[0.028141364, 0.16967583, -0.118034706, 0.1133... | \n", + "0100010000000000000000000000000000000000010000... | \n", + "0100000100001010100001000011001110000001001101... | \n", + "0000000000000000000000000000000000000000000000... | \n", + "NaN | \n", + "415.104936 | \n", + "-0.9324 | \n", + "-0.366532 | \n", + "False | \n", + "[10.382019, 105.68732, 18.721575, 34.91598, 2.... | \n", + "Enzyme:train:7578 | \n", + "
7579 | \n", + "C21737 | \n", + "[-0.017929962, 0.2529225, -0.14529729, -0.0213... | \n", + "1000000000000000000000000000000000000000000000... | \n", + "0000000000010000000000000000000000000000000000... | \n", + "0000000000000010000000000000000000000000000000... | \n", + "NaN | \n", + "238.033351 | \n", + "2.0974 | \n", + "-0.221849 | \n", + "False | \n", + "\n", + " | Enzyme:train:7579 | \n", + "
7569 rows × 12 columns
\n", + "29286 rows × 10 columns
\n", + "29128 rows × 9 columns
\n", "" ], "text/plain": [ - " Uniprot ID molecule ID evidence Binding type substrate ID \\\n", - "0 Q5B2F7 CHEBI_57344 exp 1 NaN CHEBI:57344 \n", - "1 Q9SAH9 CHEBI_58349 exp 1 NaN CHEBI:58349 \n", - "2 Q8IPJ6 CHEBI_57776 exp 1 NaN CHEBI:57776 \n", - "3 A0A1D5PCZ1 C00002 exp 1 NaN C00002 \n", - "4 O22765 CHEBI_33384 exp 1 NaN CHEBI:33384 \n", - "... ... ... ... ... ... ... \n", - "29355 O54937 CHEBI_16199 exp 0 NaN CHEBI:16199 \n", - "29356 P42980 CHEBI_43474 exp 0 NaN CHEBI:43474 \n", - "29357 P31254 CHEBI_30616 exp 0 NaN CHEBI:30616 \n", - "29358 C0HLL2 CHEBI_30616 exp 0 NaN CHEBI:30616 \n", - "29359 Q8RVK9 C00002 exp 0 NaN C00002 \n", + " Uniprot ID molecule ID evidence Binding type substrate ID \\\n", + "0 G8BBN0 CHEBI_35681 exp 1.0 NaN CHEBI:35681 \n", + "1 P78937 CHEBI_30616 exp 1.0 NaN CHEBI:30616 \n", + "2 F4K688 CHEBI_30616 exp 1.0 NaN CHEBI:30616 \n", + "3 Q9Z0J5 CHEBI_58349 exp 1.0 NaN CHEBI:58349 \n", + "4 P49189 CHEBI_58264 exp 1.0 NaN CHEBI:58264 \n", + "... ... ... ... ... ... ... \n", + "29475 C9Y9E7 CHEBI_16810 NaN 0.0 engqvist CHEBI:16810 \n", + "29476 C9Y9E7 CHEBI_17544 NaN 0.0 engqvist CHEBI:17544 \n", + "29477 C9Y9E7 C00007 NaN 0.0 engqvist C00007 \n", + "29478 D4MUV9 CHEBI_16810 NaN 0.0 engqvist CHEBI:16810 \n", + "29479 D4MUV9 CHEBI_17478 NaN 0.0 engqvist CHEBI:17478 \n", "\n", " ECFP \\\n", - "0 0100000001000000000000000000000001000000000000... \n", - "1 0000000001000000100000100000000000000000000000... \n", - "2 0000000000000000000000000000010001000000000000... \n", - "3 0000000001000000000000000000000000000000000000... \n", - "4 0100000000000000000000000000000000000000000000... \n", + "0 0100000000000000000000000000000001000000000000... \n", + "1 0000000001000000000000000000000000000000000100... \n", + "2 0000000001000000000000000000000000000000000100... \n", + "3 0000000001000000100000100000000000000000000000... \n", + "4 0000000000000000010000000000000000000000000000... \n", "... ... \n", - "29355 0000000000000000000000000000000000000000000000... \n", - "29356 0000000000000000000000000000000000000000000000... \n", - "29357 0000000001000000000000000000000000000000000100... \n", - "29358 0000000001000000000000000000000000000000000100... \n", - "29359 0000000001000000000000000000000000000000000000... \n", + "29475 0000000000000000000000000000000010000000000000... \n", + "29476 0000000000000000000000000000000000000000000000... \n", + "29477 0000000000000000100000000000000000000000000000... \n", + "29478 0000000000000000000000000000000010000000000000... \n", + "29479 0000000000000000000000000000000000000000000000... \n", "\n", " ESM1b \\\n", - "0 [0.09207666, 0.18022089, 0.1191696, -0.0068351... \n", - "1 [0.022810845, 0.1272514, -0.051154055, -0.0810... \n", - "2 [0.09814875, 0.22172487, 0.11138555, 0.0365497... \n", - "3 [-0.21187752, 0.08564956, 0.055316914, -0.0550... \n", - "4 [0.027133903, 0.33383188, -0.0057643764, -0.00... \n", + "0 [-0.033332635, 0.35044205, -0.07861315, 0.0046... \n", + "1 [0.049317513, 0.11258735, -0.08035447, 0.04825... \n", + "2 [-0.005019231, 0.06971764, -0.022618646, -0.03... \n", + "3 [-0.15290919, 0.31520224, 0.025415594, 0.02750... \n", + "4 [-0.044796597, 0.24305029, 0.10043996, -0.0269... \n", "... ... \n", - "29355 [0.050787933, 0.20482497, -0.0821579, -0.03619... \n", - "29356 [0.03450865, 0.10044937, -0.081294104, 0.03105... \n", - "29357 [-0.10911206, 0.12464452, -0.006680568, 0.1137... \n", - "29358 [0.087619156, 0.30014926, 0.051759467, 0.07981... \n", - "29359 [-0.0870039, 0.34124222, 0.20787948, -0.154150... \n", + "29475 [0.1486952, 0.23952422, -0.18132365, 0.0853893... \n", + "29476 [0.1486952, 0.23952422, -0.18132365, 0.0853893... \n", + "29477 [0.1486952, 0.23952422, -0.18132365, 0.0853893... \n", + "29478 [0.08790772, 0.17450011, -0.014648443, 0.06931... \n", + "29479 [0.08790772, 0.17450011, -0.014648443, 0.06931... \n", "\n", - " ESM1b_ts \\\n", - "0 [-0.52362674, 0.5027057, -0.40282017, 0.742947... \n", - "1 [0.61918294, 0.121414125, 0.40603346, 1.126637... \n", - "2 [0.29864457, 0.22536643, 0.27347004, -0.128196... \n", - "3 [-0.86605054, -0.38922024, -0.539311, 1.373580... \n", - "4 [1.1005167, -1.0289398, -0.061415985, 0.988528... \n", - "... ... \n", - "29355 [-0.6821743, -0.25235456, -0.06566423, 0.84851... \n", - "29356 [0.7604322, -0.6746883, 0.038595006, 0.1019296... \n", - "29357 [0.5049492, 0.23488945, -0.7357721, 0.21344757... \n", - "29358 [1.0081663, -0.47126764, 0.106960185, -0.28055... \n", - "29359 [-0.59816426, 0.41644707, -0.7390129, 1.428337... \n", - "\n", - " GNN rep \n", - "0 [1577.9962, 10.317345, 29.326752, 233.01369, 4... \n", - "1 [2261.6094, 0.0, 0.0, 115.09651, 179.84134, 46... \n", - "2 [791.13226, 7.796671, 0.0, 0.0, 4.66982, 10.69... \n", - "3 [1238.0188, 0.0, 0.0, 42.365837, 74.54658, 28.... \n", - "4 [72.62339, 18.489643, 0.0, 50.355515, 13.49715... \n", + " ESM1b_ts \n", + "0 [0.5721893, 0.56740093, 0.09789569, 0.8466092,... \n", + "1 [-0.56589794, -0.5028634, 0.2953197, -0.357490... \n", + "2 [0.3031646, 0.69172686, -1.0995013, 0.13241063... \n", + "3 [0.118711345, 0.8216332, -0.9046953, 1.179861,... \n", + "4 [0.8842707, -0.06434063, 0.5387947, 1.6151128,... \n", "... ... \n", - "29355 [10.734227, 0.0, 4.6557817, 1.7201436, 0.0, 0.... \n", - "29356 [32.170155, 0.0, 0.0, 0.0, 0.7738515, 47.04823... \n", - "29357 [1288.9618, 0.0, 0.0, 76.52397, 73.09448, 37.6... \n", - "29358 [1288.9618, 0.0, 0.0, 76.52397, 73.09448, 37.6... \n", - "29359 [1238.0188, 0.0, 0.0, 42.365837, 74.54658, 28.... \n", + "29475 [-0.5624707, 0.49068797, -0.78957033, 1.021208... \n", + "29476 [-0.5624707, 0.49068797, -0.78957033, 1.021208... \n", + "29477 [-0.5624707, 0.49068797, -0.78957033, 1.021208... \n", + "29478 [1.0554699, 0.441238, 0.19652943, 1.1101232, -... \n", + "29479 [1.0554699, 0.441238, 0.19652943, 1.1101232, -... \n", "\n", - "[29286 rows x 10 columns]" + "[29128 rows x 9 columns]" ] }, - "execution_count": 34, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -475,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -504,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -527,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -550,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -587,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -600,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -617,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -636,205 +696,224 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "pre_training = True" + ] + }, + { + "cell_type": "code", + "execution_count": 21, "metadata": { "scrolled": true }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\alexk\\anaconda3\\envs\\Predicting_Km\\lib\\site-packages\\torch\\autograd\\__init__.py:127: UserWarning: Mixed memory format inputs detected while calling the operator. The operator will output contiguous tensor even if some of the inputs are in channels_last format. (Triggered internally at ..\\aten\\src\\ATen\\native\\TensorIterator.cpp:918.)\n", + " allow_unreachable=True) # allow_unreachable flag\n", + "C:\\Users\\alexk\\anaconda3\\envs\\Predicting_Km\\lib\\site-packages\\torch\\autograd\\__init__.py:127: UserWarning: Mixed memory format inputs detected while calling the operator. The operator will output channels_last tensor even if some of the inputs are not in channels_last format. (Triggered internally at ..\\aten\\src\\ATen\\native\\TensorIterator.cpp:924.)\n", + " allow_unreachable=True) # allow_unreachable flag\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "[1, 20] loss: 0.699\n", + "[1, 20] loss: 0.743\n", "[1, 40] loss: 0.693\n", - "[1, 60] loss: 0.692\n", - "[1, 80] loss: 0.683\n", - "[1, 100] loss: 0.691\n", - "[1, 120] loss: 0.693\n", - "[1, 140] loss: 0.689\n", - "[1, 160] loss: 0.681\n", - "[1, 180] loss: 0.691\n", - "[1, 200] loss: 0.686\n", - "[1, 220] loss: 0.682\n", - "[1, 240] loss: 0.683\n", - "[1, 260] loss: 0.675\n", - "[1, 280] loss: 0.680\n", - "[1, 300] loss: 0.684\n", + "[1, 60] loss: 0.707\n", + "[1, 80] loss: 0.690\n", + "[1, 100] loss: 0.694\n", + "[1, 120] loss: 0.682\n", + "[1, 140] loss: 0.690\n", + "[1, 160] loss: 0.688\n", + "[1, 180] loss: 0.682\n", + "[1, 200] loss: 0.676\n", + "[1, 220] loss: 0.686\n", + "[1, 240] loss: 0.675\n", + "[1, 260] loss: 0.681\n", + "[1, 280] loss: 0.674\n", + "[1, 300] loss: 0.679\n", "[1, 320] loss: 0.676\n", - "[1, 340] loss: 0.686\n", - "[1, 360] loss: 0.677\n", - "Epoch: 0, Val. loss: 0.67, Val. acc: 0.57\n", - "[2, 20] loss: 0.669\n", - "[2, 40] loss: 0.669\n", - "[2, 60] loss: 0.665\n", - "[2, 80] loss: 0.667\n", - "[2, 100] loss: 0.660\n", - "[2, 120] loss: 0.663\n", - "[2, 140] loss: 0.658\n", - "[2, 160] loss: 0.662\n", - "[2, 180] loss: 0.656\n", - "[2, 200] loss: 0.655\n", - "[2, 220] loss: 0.659\n", - "[2, 240] loss: 0.654\n", - "[2, 260] loss: 0.642\n", - "[2, 280] loss: 0.648\n", - "[2, 300] loss: 0.645\n", - "[2, 320] loss: 0.657\n", - "[2, 340] loss: 0.642\n", - "[2, 360] loss: 0.645\n", - "Epoch: 1, Val. loss: 0.65, Val. acc: 0.62\n", - "[3, 20] loss: 0.647\n", - "[3, 40] loss: 0.634\n", - "[3, 60] loss: 0.619\n", - "[3, 80] loss: 0.631\n", - "[3, 100] loss: 0.626\n", - "[3, 120] loss: 0.616\n", + "[1, 340] loss: 0.673\n", + "[1, 360] loss: 0.682\n", + "Epoch: 0, Val. loss: 0.67, Val. acc: 0.58\n", + "[2, 20] loss: 0.666\n", + "[2, 40] loss: 0.662\n", + "[2, 60] loss: 0.674\n", + "[2, 80] loss: 0.662\n", + "[2, 100] loss: 0.671\n", + "[2, 120] loss: 0.660\n", + "[2, 140] loss: 0.661\n", + "[2, 160] loss: 0.659\n", + "[2, 180] loss: 0.649\n", + "[2, 200] loss: 0.668\n", + "[2, 220] loss: 0.655\n", + "[2, 240] loss: 0.659\n", + "[2, 260] loss: 0.650\n", + "[2, 280] loss: 0.646\n", + "[2, 300] loss: 0.650\n", + "[2, 320] loss: 0.635\n", + "[2, 340] loss: 0.646\n", + "[2, 360] loss: 0.643\n", + "Epoch: 1, Val. loss: 0.64, Val. acc: 0.61\n", + "[3, 20] loss: 0.626\n", + "[3, 40] loss: 0.635\n", + "[3, 60] loss: 0.632\n", + "[3, 80] loss: 0.639\n", + "[3, 100] loss: 0.624\n", + "[3, 120] loss: 0.624\n", "[3, 140] loss: 0.628\n", - "[3, 160] loss: 0.623\n", - "[3, 180] loss: 0.617\n", - "[3, 200] loss: 0.620\n", - "[3, 220] loss: 0.608\n", - "[3, 240] loss: 0.613\n", - "[3, 260] loss: 0.625\n", - "[3, 280] loss: 0.631\n", - "[3, 300] loss: 0.612\n", - "[3, 320] loss: 0.604\n", - "[3, 340] loss: 0.614\n", - "[3, 360] loss: 0.607\n", - "Epoch: 2, Val. loss: 0.62, Val. acc: 0.66\n", - "[4, 20] loss: 0.591\n", - "[4, 40] loss: 0.594\n", - "[4, 60] loss: 0.590\n", - "[4, 80] loss: 0.594\n", - "[4, 100] loss: 0.605\n", - "[4, 120] loss: 0.597\n", - "[4, 140] loss: 0.588\n", - "[4, 160] loss: 0.604\n", - "[4, 180] loss: 0.593\n", - "[4, 200] loss: 0.593\n", - "[4, 220] loss: 0.597\n", - "[4, 240] loss: 0.587\n", - "[4, 260] loss: 0.583\n", - "[4, 280] loss: 0.601\n", - "[4, 300] loss: 0.604\n", - "[4, 320] loss: 0.602\n", - "[4, 340] loss: 0.583\n", - "[4, 360] loss: 0.579\n", - "Epoch: 3, Val. loss: 0.6, Val. acc: 0.67\n", - "[5, 20] loss: 0.578\n", - "[5, 40] loss: 0.591\n", - "[5, 60] loss: 0.589\n", - "[5, 80] loss: 0.589\n", - "[5, 100] loss: 0.582\n", + "[3, 160] loss: 0.617\n", + "[3, 180] loss: 0.623\n", + "[3, 200] loss: 0.630\n", + "[3, 220] loss: 0.639\n", + "[3, 240] loss: 0.635\n", + "[3, 260] loss: 0.632\n", + "[3, 280] loss: 0.618\n", + "[3, 300] loss: 0.622\n", + "[3, 320] loss: 0.607\n", + "[3, 340] loss: 0.622\n", + "[3, 360] loss: 0.612\n", + "Epoch: 2, Val. loss: 0.61, Val. acc: 0.63\n", + "[4, 20] loss: 0.608\n", + "[4, 40] loss: 0.612\n", + "[4, 60] loss: 0.609\n", + "[4, 80] loss: 0.609\n", + "[4, 100] loss: 0.607\n", + "[4, 120] loss: 0.604\n", + "[4, 140] loss: 0.611\n", + "[4, 160] loss: 0.602\n", + "[4, 180] loss: 0.600\n", + "[4, 200] loss: 0.587\n", + "[4, 220] loss: 0.616\n", + "[4, 240] loss: 0.588\n", + "[4, 260] loss: 0.589\n", + "[4, 280] loss: 0.602\n", + "[4, 300] loss: 0.597\n", + "[4, 320] loss: 0.588\n", + "[4, 340] loss: 0.588\n", + "[4, 360] loss: 0.594\n", + "Epoch: 3, Val. loss: 0.58, Val. acc: 0.67\n", + "[5, 20] loss: 0.580\n", + "[5, 40] loss: 0.590\n", + "[5, 60] loss: 0.588\n", + "[5, 80] loss: 0.598\n", + "[5, 100] loss: 0.571\n", "[5, 120] loss: 0.562\n", - "[5, 140] loss: 0.589\n", - "[5, 160] loss: 0.570\n", - "[5, 180] loss: 0.591\n", - "[5, 200] loss: 0.574\n", - "[5, 220] loss: 0.557\n", - "[5, 240] loss: 0.585\n", - "[5, 260] loss: 0.571\n", - "[5, 280] loss: 0.582\n", - "[5, 300] loss: 0.576\n", - "[5, 320] loss: 0.576\n", - "[5, 340] loss: 0.572\n", - "[5, 360] loss: 0.565\n", - "Epoch: 4, Val. loss: 0.6, Val. acc: 0.66\n", - "[6, 20] loss: 0.558\n", - "[6, 40] loss: 0.549\n", - "[6, 60] loss: 0.574\n", - "[6, 80] loss: 0.576\n", - "[6, 100] loss: 0.571\n", - "[6, 120] loss: 0.575\n", - "[6, 140] loss: 0.560\n", - "[6, 160] loss: 0.571\n", - "[6, 180] loss: 0.547\n", - "[6, 200] loss: 0.563\n", - "[6, 220] loss: 0.551\n", - "[6, 240] loss: 0.551\n", - "[6, 260] loss: 0.571\n", - "[6, 280] loss: 0.573\n", - "[6, 300] loss: 0.574\n", - "[6, 320] loss: 0.567\n", - "[6, 340] loss: 0.569\n", - "[6, 360] loss: 0.591\n", + "[5, 140] loss: 0.585\n", + "[5, 160] loss: 0.567\n", + "[5, 180] loss: 0.581\n", + "[5, 200] loss: 0.560\n", + "[5, 220] loss: 0.566\n", + "[5, 240] loss: 0.577\n", + "[5, 260] loss: 0.596\n", + "[5, 280] loss: 0.584\n", + "[5, 300] loss: 0.587\n", + "[5, 320] loss: 0.580\n", + "[5, 340] loss: 0.566\n", + "[5, 360] loss: 0.579\n", + "Epoch: 4, Val. loss: 0.58, Val. acc: 0.68\n", + "[6, 20] loss: 0.582\n", + "[6, 40] loss: 0.571\n", + "[6, 60] loss: 0.572\n", + "[6, 80] loss: 0.561\n", + "[6, 100] loss: 0.578\n", + "[6, 120] loss: 0.576\n", + "[6, 140] loss: 0.583\n", + "[6, 160] loss: 0.568\n", + "[6, 180] loss: 0.574\n", + "[6, 200] loss: 0.571\n", + "[6, 220] loss: 0.557\n", + "[6, 240] loss: 0.570\n", + "[6, 260] loss: 0.564\n", + "[6, 280] loss: 0.577\n", + "[6, 300] loss: 0.568\n", + "[6, 320] loss: 0.585\n", + "[6, 340] loss: 0.567\n", + "[6, 360] loss: 0.575\n", "Epoch: 5, Val. loss: 0.57, Val. acc: 0.69\n", - "[7, 20] loss: 0.554\n", - "[7, 40] loss: 0.542\n", - "[7, 60] loss: 0.551\n", - "[7, 80] loss: 0.553\n", - "[7, 100] loss: 0.555\n", - "[7, 120] loss: 0.549\n", - "[7, 140] loss: 0.544\n", - "[7, 160] loss: 0.569\n", - "[7, 180] loss: 0.557\n", - "[7, 200] loss: 0.574\n", - "[7, 220] loss: 0.552\n", - "[7, 240] loss: 0.549\n", - "[7, 260] loss: 0.527\n", - "[7, 280] loss: 0.545\n", - "[7, 300] loss: 0.569\n", - "[7, 320] loss: 0.555\n", - "[7, 340] loss: 0.559\n", - "[7, 360] loss: 0.571\n", - "Epoch: 6, Val. loss: 0.55, Val. acc: 0.68\n", - "[8, 20] loss: 0.541\n", - "[8, 40] loss: 0.562\n", + "[7, 20] loss: 0.542\n", + "[7, 40] loss: 0.564\n", + "[7, 60] loss: 0.538\n", + "[7, 80] loss: 0.548\n", + "[7, 100] loss: 0.552\n", + "[7, 120] loss: 0.557\n", + "[7, 140] loss: 0.547\n", + "[7, 160] loss: 0.580\n", + "[7, 180] loss: 0.565\n", + "[7, 200] loss: 0.580\n", + "[7, 220] loss: 0.558\n", + "[7, 240] loss: 0.537\n", + "[7, 260] loss: 0.564\n", + "[7, 280] loss: 0.569\n", + "[7, 300] loss: 0.547\n", + "[7, 320] loss: 0.559\n", + "[7, 340] loss: 0.560\n", + "[7, 360] loss: 0.578\n", + "Epoch: 6, Val. loss: 0.56, Val. acc: 0.69\n", + "[8, 20] loss: 0.542\n", + "[8, 40] loss: 0.553\n", "[8, 60] loss: 0.540\n", - "[8, 80] loss: 0.536\n", - "[8, 100] loss: 0.537\n", - "[8, 120] loss: 0.527\n", - "[8, 140] loss: 0.547\n", - "[8, 160] loss: 0.551\n", - "[8, 180] loss: 0.557\n", - "[8, 200] loss: 0.525\n", - "[8, 220] loss: 0.539\n", - "[8, 240] loss: 0.561\n", - "[8, 260] loss: 0.563\n", - "[8, 280] loss: 0.546\n", - "[8, 300] loss: 0.549\n", - "[8, 320] loss: 0.531\n", - "[8, 340] loss: 0.537\n", - "[8, 360] loss: 0.543\n", - "Epoch: 7, Val. loss: 0.56, Val. acc: 0.7\n", - "[9, 20] loss: 0.532\n", - "[9, 40] loss: 0.548\n", - "[9, 60] loss: 0.533\n", - "[9, 80] loss: 0.536\n", - "[9, 100] loss: 0.517\n", + "[8, 80] loss: 0.551\n", + "[8, 100] loss: 0.544\n", + "[8, 120] loss: 0.537\n", + "[8, 140] loss: 0.559\n", + "[8, 160] loss: 0.542\n", + "[8, 180] loss: 0.552\n", + "[8, 200] loss: 0.536\n", + "[8, 220] loss: 0.551\n", + "[8, 240] loss: 0.556\n", + "[8, 260] loss: 0.540\n", + "[8, 280] loss: 0.573\n", + "[8, 300] loss: 0.543\n", + "[8, 320] loss: 0.555\n", + "[8, 340] loss: 0.550\n", + "[8, 360] loss: 0.539\n", + "Epoch: 7, Val. loss: 0.55, Val. acc: 0.7\n", + "[9, 20] loss: 0.551\n", + "[9, 40] loss: 0.562\n", + "[9, 60] loss: 0.548\n", + "[9, 80] loss: 0.557\n", + "[9, 100] loss: 0.554\n", "[9, 120] loss: 0.555\n", - "[9, 140] loss: 0.512\n", - "[9, 160] loss: 0.523\n", - "[9, 180] loss: 0.525\n", - "[9, 200] loss: 0.529\n", - "[9, 220] loss: 0.560\n", - "[9, 240] loss: 0.528\n", - "[9, 260] loss: 0.527\n", - "[9, 280] loss: 0.551\n", - "[9, 300] loss: 0.550\n", - "[9, 320] loss: 0.560\n", - "[9, 340] loss: 0.548\n", - "[9, 360] loss: 0.541\n", - "Epoch: 8, Val. loss: 0.54, Val. acc: 0.7\n", - "[10, 20] loss: 0.533\n", - "[10, 40] loss: 0.544\n", - "[10, 60] loss: 0.540\n", - "[10, 80] loss: 0.522\n", - "[10, 100] loss: 0.540\n", - "[10, 120] loss: 0.513\n", - "[10, 140] loss: 0.538\n", - "[10, 160] loss: 0.523\n", - "[10, 180] loss: 0.518\n", - "[10, 200] loss: 0.517\n", - "[10, 220] loss: 0.529\n", - "[10, 240] loss: 0.537\n", - "[10, 260] loss: 0.549\n", - "[10, 280] loss: 0.502\n", - "[10, 300] loss: 0.535\n", - "[10, 320] loss: 0.540\n", - "[10, 340] loss: 0.522\n", - "[10, 360] loss: 0.547\n", - "Epoch: 9, Val. loss: 0.53, Val. acc: 0.71\n", + "[9, 140] loss: 0.550\n", + "[9, 160] loss: 0.543\n", + "[9, 180] loss: 0.539\n", + "[9, 200] loss: 0.545\n", + "[9, 220] loss: 0.544\n", + "[9, 240] loss: 0.538\n", + "[9, 260] loss: 0.546\n", + "[9, 280] loss: 0.548\n", + "[9, 300] loss: 0.536\n", + "[9, 320] loss: 0.538\n", + "[9, 340] loss: 0.542\n", + "[9, 360] loss: 0.545\n", + "Epoch: 8, Val. loss: 0.54, Val. acc: 0.71\n", + "[10, 20] loss: 0.519\n", + "[10, 40] loss: 0.523\n", + "[10, 60] loss: 0.521\n", + "[10, 80] loss: 0.537\n", + "[10, 100] loss: 0.525\n", + "[10, 120] loss: 0.525\n", + "[10, 140] loss: 0.532\n", + "[10, 160] loss: 0.536\n", + "[10, 180] loss: 0.543\n", + "[10, 200] loss: 0.544\n", + "[10, 220] loss: 0.556\n", + "[10, 240] loss: 0.544\n", + "[10, 260] loss: 0.558\n", + "[10, 280] loss: 0.505\n", + "[10, 300] loss: 0.548\n", + "[10, 320] loss: 0.539\n", + "[10, 340] loss: 0.534\n", + "[10, 360] loss: 0.528\n", + "Epoch: 9, Val. loss: 0.54, Val. acc: 0.71\n", "Finished Training\n" ] } @@ -843,9 +922,16 @@ "import torch.optim as optim\n", "\n", "model = GNN(D= 100, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)\n", + "if pre_training:\n", + " model.load_state_dict(torch.load(join(CURRENT_DIR, \"..\" ,\"data\", \"substrate_data_KM\", \"GNN\", \"Pytorch_GNN_KM\")))\n", + "\n", "criterion = nn.BCELoss()\n", "optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay= 0.00001)\n", "\n", + "\n", + " \n", + "\n", + "\n", "for epoch in range(10): # loop over the dataset multiple times\n", " model.train()\n", " running_loss = 0.0\n", @@ -891,152 +977,13 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "torch.save(model.state_dict(),join(CURRENT_DIR, \"..\" ,\"data\", \"substrate_data\", \"GNN\", \"Pytorch_GNN_V2\"))" + "torch.save(model.state_dict(),join(CURRENT_DIR, \"..\" ,\"data\", \"substrate_data\", \"GNN\", \"Pytorch_GNN_with_pretraining\"))" ] }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 20] loss: 0.527\n", - "[1, 40] loss: 0.524\n", - "[1, 60] loss: 0.517\n", - "[1, 80] loss: 0.511\n", - "[1, 100] loss: 0.522\n", - "[1, 120] loss: 0.519\n", - "[1, 140] loss: 0.520\n", - "[1, 160] loss: 0.507\n", - "[1, 180] loss: 0.515\n", - "[1, 200] loss: 0.526\n", - "[1, 220] loss: 0.511\n", - "[1, 240] loss: 0.527\n", - "[1, 260] loss: 0.528\n", - "[1, 280] loss: 0.522\n", - "[1, 300] loss: 0.518\n", - "[1, 320] loss: 0.524\n", - "[1, 340] loss: 0.531\n", - "[1, 360] loss: 0.545\n", - "Epoch: 0, Val. loss: 0.54, Val. acc: 0.71\n", - "[2, 20] loss: 0.536\n", - "[2, 40] loss: 0.537\n", - "[2, 60] loss: 0.502\n", - "[2, 80] loss: 0.527\n", - "[2, 100] loss: 0.544\n", - "[2, 120] loss: 0.522\n", - "[2, 140] loss: 0.519\n", - "[2, 160] loss: 0.536\n", - "[2, 180] loss: 0.522\n", - "[2, 200] loss: 0.519\n", - "[2, 220] loss: 0.519\n", - "[2, 240] loss: 0.526\n", - "[2, 260] loss: 0.500\n", - "[2, 280] loss: 0.536\n", - "[2, 300] loss: 0.507\n", - "[2, 320] loss: 0.514\n", - "[2, 340] loss: 0.527\n", - "[2, 360] loss: 0.512\n", - "Epoch: 1, Val. loss: 0.53, Val. acc: 0.72\n", - "[3, 20] loss: 0.533\n", - "[3, 40] loss: 0.522\n", - "[3, 60] loss: 0.498\n", - "[3, 80] loss: 0.493\n", - "[3, 100] loss: 0.496\n", - "[3, 120] loss: 0.518\n", - "[3, 140] loss: 0.524\n", - "[3, 160] loss: 0.508\n", - "[3, 180] loss: 0.523\n", - "[3, 200] loss: 0.510\n", - "[3, 220] loss: 0.506\n", - "[3, 240] loss: 0.509\n", - "[3, 260] loss: 0.524\n", - "[3, 280] loss: 0.519\n", - "[3, 300] loss: 0.520\n", - "[3, 320] loss: 0.516\n", - "[3, 340] loss: 0.493\n", - "[3, 360] loss: 0.531\n", - "Epoch: 2, Val. loss: 0.52, Val. acc: 0.72\n", - "Finished Training\n" - ] - } - ], - "source": [ - "for epoch in range(3): # loop over the dataset multiple times\n", - " model.train()\n", - " running_loss = 0.0\n", - " for i, [XE, X, A,ESM1b, labels] in enumerate(train_loader):\n", - " # zero the parameter gradients\n", - " optimizer.zero_grad()\n", - " XE, X, A, ESM1b, labels = XE.to(device), X.to(device), A.to(device),ESM1b.to(device), labels.to(device)\n", - " # forward + backward + optimize\n", - " outputs = model(XE, X, A, ESM1b)\n", - " loss = criterion(outputs, labels.view((batch_size,-1)))\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " # print statistics\n", - " running_loss += loss.item()\n", - " if i % 20 == 19: # print every 2000 mini-batches\n", - " print('[%d, %5d] loss: %.3f' %\n", - " (epoch + 1, i + 1, running_loss / 20))\n", - " running_loss = 0.0\n", - " \n", - " #After each epoch, calculate the validation loss:\n", - " running_acc = 0.0\n", - " running_loss = 0.0\n", - " model.eval()\n", - " for i, [XE, X, A,ESM1b, labels] in enumerate(test_loader):\n", - " XE, X, A, ESM1b, labels = XE.to(device), X.to(device), A.to(device),ESM1b.to(device), labels.to(device)\n", - " \n", - " with torch.no_grad():\n", - " outputs = model(XE, X, A, ESM1b)\n", - " loss = criterion(outputs, labels.view((batch_size,-1)))\n", - " running_loss += loss.item()\n", - "\n", - " outputs2 = np.round(outputs.view(-1).cpu().detach().numpy()) \n", - " labels2 = labels.cpu().detach().numpy()\n", - " acc = np.mean([outputs2[i] == labels2[i] for i in range(len(labels))])\n", - " running_acc += acc\n", - "\n", - " print(\"Epoch: %s, Val. loss: %s, Val. acc: %s\" % (epoch, np.round(running_loss/(i+1),2),\n", - " np.round(running_acc/(i+1), 2)))\n", - "\n", - "print('Finished Training')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "GNN(\n", - " (BN1): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (BN2): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (BN3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (linear1): Linear(in_features=100, out_features=32, bias=True)\n", - " (linear2): Linear(in_features=32, out_features=1, bias=True)\n", - " (drop_layer): Dropout(p=0.2, inplace=False)\n", - ")" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -1053,30 +1000,30 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GNN(\n", - " (BN1): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (BN2): BatchNorm2d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (BN3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (linear1): Linear(in_features=100, out_features=32, bias=True)\n", + " (BN1): BatchNorm2d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (BN2): BatchNorm2d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (BN3): BatchNorm1d(150, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (linear1): Linear(in_features=150, out_features=32, bias=True)\n", " (linear2): Linear(in_features=32, out_features=1, bias=True)\n", " (drop_layer): Dropout(p=0.2, inplace=False)\n", ")" ] }, - "execution_count": 2, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "model = GNN(D= 50, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)\n", - "model.load_state_dict(torch.load(join(CURRENT_DIR, \"..\" ,\"data\", \"substrate_data\", \"GNN\",\"Pytorch_GNN\")))\n", + "model = GNN(D= 100, N = 70, F1 = 32 , F2 = 10, F = F1+F2).to(device)\n", + "model.load_state_dict(torch.load(join(CURRENT_DIR, \"..\" ,\"data\", \"substrate_data\", \"GNN\", \"Pytorch_GNN_with_pretraining\")))\n", "model.eval()" ] }, @@ -1089,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1106,7 +1053,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1166,53 +1113,53 @@ "1352 rows × 2 columns
\n", + "1360 rows × 2 columns
\n", "" ], "text/plain": [ - " molecule ID uid\n", - "0 C00001 P9WIQ3\n", - "1 C00002 P9WIQ3\n", - "2 C00003 P9WIQ3\n", - "3 C00004 P9WIQ3\n", - "4 C00005 P9WIQ3\n", - "... ... ...\n", - "1347 CHEBI_85986 P9WIQ3\n", - "1348 CHEBI_86339 P9WIQ3\n", - "1349 CHEBI_87136 P9WIQ3\n", - "1350 CHEBI_87305 P9WIQ3\n", - "1351 CHEBI_88052 P9WIQ3\n", + " molecule ID uid\n", + "0 C00001 P9WIQ3\n", + "1 C00002 P9WIQ3\n", + "2 C00003 P9WIQ3\n", + "3 C00004 P9WIQ3\n", + "4 C00005 P9WIQ3\n", + "... ... ...\n", + "1355 CHEBI_88052 P9WIQ3\n", + "1356 InChI=1SQC18H36O3Qc1-2-3-4-5-6-7-8-9-10-11-12-... P9WIQ3\n", + "1357 InChI=1SQC3H6O3Qc1-2(4)3(5)6Qh2,4H,1H3,(H,5,6)... P9WIQ3\n", + "1358 InChI=1SQC8H16O3Qc1-2-3-4-5-6-7(9)8(10)11Qh7,9... P9WIQ3\n", + "1359 InChI=1SQC8H8O3Qc9-7(8(10)11)6-4-2-1-3-5-6Qh1-... P9WIQ3\n", "\n", - "[1352 rows x 2 columns]" + "[1360 rows x 2 columns]" ] }, - "execution_count": 4, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1230,7 +1177,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -1283,9 +1230,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 16, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21\n" + ] + }, { "data": { "text/html": [ @@ -1317,31 +1271,31 @@ "1352 rows × 3 columns
\n", + "1360 rows × 3 columns
\n", "" ], "text/plain": [ - " molecule ID uid substrate_rep\n", - "0 C00001 P9WIQ3 [0.029613253, 0.0, 0.26818, 0.0, 0.13194986, 0...\n", - "1 C00002 P9WIQ3 [1238.0188, 0.0, 0.0, 42.365837, 74.54658, 28....\n", - "2 C00003 P9WIQ3 [2111.6353, 0.0, 0.0, 139.63763, 183.79233, 33...\n", - "3 C00004 P9WIQ3 [1813.2203, 60.59075, 0.0, 224.62354, 335.9574...\n", - "4 C00005 P9WIQ3 [1859.3567, 120.4939, 0.0, 255.73376, 384.9909...\n", - "... ... ... ...\n", - "1347 CHEBI_85986 P9WIQ3 [673.1395, 31.783257, 0.0, 15.939333, 38.13377...\n", - "1348 CHEBI_86339 P9WIQ3 [6.3847322, 18.652742, 0.0, 76.061226, 61.6412...\n", - "1349 CHEBI_87136 P9WIQ3 [1872.2444, 96.23757, 0.0, 217.94662, 388.0197...\n", - "1350 CHEBI_87305 P9WIQ3 [1924.4116, 97.580894, 0.0, 217.94662, 384.225...\n", - "1351 CHEBI_88052 P9WIQ3 [82.04205, 2.6866338, 36.013027, 88.17541, 116...\n", + " molecule ID uid \\\n", + "0 C00001 P9WIQ3 \n", + "1 C00002 P9WIQ3 \n", + "2 C00003 P9WIQ3 \n", + "3 C00004 P9WIQ3 \n", + "4 C00005 P9WIQ3 \n", + "... ... ... \n", + "1355 CHEBI_88052 P9WIQ3 \n", + "1356 InChI=1SQC18H36O3Qc1-2-3-4-5-6-7-8-9-10-11-12-... P9WIQ3 \n", + "1357 InChI=1SQC3H6O3Qc1-2(4)3(5)6Qh2,4H,1H3,(H,5,6)... P9WIQ3 \n", + "1358 InChI=1SQC8H16O3Qc1-2-3-4-5-6-7(9)8(10)11Qh7,9... P9WIQ3 \n", + "1359 InChI=1SQC8H8O3Qc9-7(8(10)11)6-4-2-1-3-5-6Qh1-... P9WIQ3 \n", "\n", - "[1352 rows x 3 columns]" + " substrate_rep \n", + "0 [0.0, 0.6329006, 0.0, 44.773804, 41.644196, 21... \n", + "1 [0.0, 46.134777, 0.0, 0.0, 1177.1073, 187.5388... \n", + "2 [0.0, 8.150052, 0.0, 8.550125, 40.956882, 66.0... \n", + "3 [0.0, 1.8680593, 0.0, 0.0, 44.121048, 557.9273... \n", + "4 [0.0, 0.0, 0.0, 0.0, 183.79741, 173.6141, 0.0,... \n", + "... ... \n", + "1355 [0.0, 46.631508, 0.0, 0.0, 2.3734078, 1.361197... \n", + "1356 [0.0, 0.2668656, 0.0, 0.0, 0.784772, 10.25194,... \n", + "1357 [0.0, 0.9508717, 0.0, 90.905556, 0.79774696, 0... \n", + "1358 [0.0, 0.2668656, 0.0, 0.0, 0.784772, 10.25194,... \n", + "1359 [0.0, 0.0, 0.0, 32.41623, 66.58023, 126.79576,... \n", + "\n", + "[1360 rows x 3 columns]" ] }, - "execution_count": 7, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1413,77 +1380,69 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "df_train = pd.read_pickle(join(CURRENT_DIR, \"..\" ,\"data\", \"splits\", \"df_train_with_ESM1b_ts_GNN.pkl\"))\n", + "df_test = pd.read_pickle(join(CURRENT_DIR, \"..\" ,\"data\", \"splits\", \"df_test_with_ESM1b_ts_GNN.pkl\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\alexk\\AppData\\Local\\Temp/ipykernel_6620/2047707114.py:4: SettingWithCopyWarning: \n", + "C:\\Users\\alexk\\anaconda3\\envs\\Predicting_Km\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_train[\"GNN rep\"][ind] = list(df_mols[\"substrate_rep\"].loc[df_mols[\"molecule ID\"] == df_train[\"molecule ID\"][ind].replace(\":\", \"_\")])[0]\n" + " after removing the cwd from sys.path.\n" ] } ], "source": [ - "df_train[\"GNN rep\"] = \"\"\n", + "df_train[\"GNN rep (pretrained)\"] = \"\"\n", "for ind in df_train.index:\n", " try:\n", - " df_train[\"GNN rep\"][ind] = list(df_mols[\"substrate_rep\"].loc[df_mols[\"molecule ID\"] == df_train[\"molecule ID\"][ind].replace(\":\", \"_\")])[0]\n", + " df_train[\"GNN rep (pretrained)\"][ind] = list(df_mols[\"substrate_rep\"].loc[df_mols[\"molecule ID\"] == df_train[\"molecule ID\"][ind].replace(\":\", \"_\").replace(\"Q\", \"/\")])[0]\n", " except IndexError:\n", " pass" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\alexk\\AppData\\Local\\Temp/ipykernel_6620/451796927.py:4: SettingWithCopyWarning: \n", + "C:\\Users\\alexk\\anaconda3\\envs\\Predicting_Km\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_test[\"GNN rep\"][ind] = list(df_mols[\"substrate_rep\"].loc[df_mols[\"molecule ID\"] == df_test[\"molecule ID\"][ind].replace(\":\", \"_\")])[0]\n" + " after removing the cwd from sys.path.\n" ] } ], "source": [ - "df_test[\"GNN rep\"] = \"\"\n", + "df_test[\"GNN rep (pretrained)\"] = \"\"\n", "for ind in df_test.index:\n", " try:\n", - " df_test[\"GNN rep\"][ind] = list(df_mols[\"substrate_rep\"].loc[df_mols[\"molecule ID\"] == df_test[\"molecule ID\"][ind].replace(\":\", \"_\")])[0]\n", + " df_test[\"GNN rep (pretrained)\"][ind] = list(df_mols[\"substrate_rep\"].loc[df_mols[\"molecule ID\"] == df_test[\"molecule ID\"][ind].replace(\":\", \"_\").replace(\"Q\", \"/\")])[0]\n", " except IndexError:\n", " pass" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "'''df_engqvist[\"molecule ID\"] = df_engqvist[\"substrate\"]\n", - "\n", - "df_engqvist[\"GNN rep\"] = \"\"\n", - "for ind in df_engqvist.index:\n", - " try:\n", - " df_engqvist[\"GNN rep\"][ind] = list(df_mols[\"substrate_rep\"].loc[df_mols[\"molecule ID\"] == df_engqvist[\"molecule ID\"][ind].replace(\":\", \"_\")])[0]\n", - " except IndexError:\n", - " pass\n", - " \n", - "df_engqvist''';" - ] - }, - { - "cell_type": "code", - "execution_count": 12, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [