diff --git a/scripts/evaluate-generated-mofs/0_effect-of-training.ipynb b/scripts/evaluate-generated-mofs/0_effect-of-training.ipynb deleted file mode 100644 index 0ca3b6a4..00000000 --- a/scripts/evaluate-generated-mofs/0_effect-of-training.ipynb +++ /dev/null @@ -1,467 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "1719a71e-77b6-4edf-aea7-06656446ba2f", - "metadata": {}, - "source": [ - "# Are MOFs Generated by Later Models Better?\n", - "We periodically retrain the DiffLinker, and hope that the ones generated by later interations of the model are better." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "9b732ebf-fe93-4610-bd44-9430b14f79fe", - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "from matplotlib import pyplot as plt\n", - "from datetime import datetime\n", - "from pathlib import Path\n", - "from tqdm import tqdm\n", - "import pandas as pd\n", - "import json\n", - "import gzip" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "9efc8697-48e3-4fc5-8181-63ecb29c4fba", - "metadata": {}, - "outputs": [], - "source": [ - "run_dir = Path('../prod-runs/256-nodes/')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "7d8cb69f-e7ba-4aee-8652-0fed12d6eaf8", - "metadata": {}, - "outputs": [], - "source": [ - "Path('figures').mkdir(exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "id": "88b14321-1e9c-48c0-872a-7fcbd734ac13", - "metadata": {}, - "source": [ - "## Load the Data from Disk\n", - "And make it compact\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "f510924d-3905-41db-aa68-2aecfbe8e75a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "32341it [01:43, 312.62it/s]\n" - ] - } - ], - "source": [ - "records = []\n", - "with gzip.open(run_dir / 'mofs.json.gz', 'rt') as fp:\n", - " for line in tqdm(fp):\n", - " record = json.loads(line)\n", - "\n", - " # Remove structure data, label linkers by anchor\n", - " for k in ['md_trajectory', 'nodes', 'structure', '_id']:\n", - " del record[k]\n", - " for ligand in record.pop('ligands'):\n", - " record[f'ligand.{ligand[\"anchor_type\"]}'] = ligand\n", - " for k in ['xyz', 'dummy_element', 'anchor_type']:\n", - " del ligand[k]\n", - "\n", - " record['time'] = record.pop('times')['created']['$date']\n", - " records.append(pd.json_normalize(record))\n", - "records = pd.concat(records, ignore_index=True)" - ] - }, - { - "cell_type": "markdown", - "id": "b16f764b-dfa9-4928-99e6-898155e24e45", - "metadata": {}, - "source": [ - "Store the model versions" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "6445ea74-8c5b-43bf-97b1-c680a9d3f1a3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | name | \n", - "topology | \n", - "catenation | \n", - "time | \n", - "gas_storage.CO2 | \n", - "structure_stability.uff | \n", - "ligand.COO.name | \n", - "ligand.COO.smiles | \n", - "ligand.COO.prompt_atoms | \n", - "ligand.COO.metadata.model_version | \n", - "ligand.cyano.name | \n", - "ligand.cyano.smiles | \n", - "ligand.cyano.prompt_atoms | \n", - "ligand.cyano.metadata.model_version | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "mof-00a88ea5 | \n", - "None | \n", - "None | \n", - "2024-04-13T23:18:21.584Z | \n", - "[10000.0, 0.0862266618] | \n", - "0.209704 | \n", - "ligand-a1037294 | \n", - "O=C([O-])c1ccc(C=C=[S+2]=C/[C-]=C/c2ccc(C(=O)O... | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", - "0 | \n", - "ligand-6536db2c | \n", - "N#Cc1ccc(C#CC#CC#Cc2ccc(C#N)cc2)cc1 | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", - "0 | \n", - "
1 | \n", - "mof-89fd0977 | \n", - "None | \n", - "None | \n", - "2024-04-13T23:18:00.040Z | \n", - "[10000.0, 0.0756631463] | \n", - "0.228959 | \n", - "ligand-0bb2fcf6 | \n", - "[O-][C+](O)[C-]1[CH+][CH+][C+]([C][C][CH+][C][... | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", - "0 | \n", - "ligand-b53051b9 | \n", - "[N-2][C][c+]1[cH-][cH+][c-]([C][C][CH+][C][C][... | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", - "0 | \n", - "
2 | \n", - "mof-1ef2070f | \n", - "None | \n", - "None | \n", - "2024-04-13T23:17:59.318Z | \n", - "[10000.0, 0.2165945735] | \n", - "0.201553 | \n", - "ligand-0fa742f0 | \n", - "O=C([O-])c1ccc(C#C/[S+]=C/C#Cc2ccc(C(=O)O)cc2)cc1 | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", - "0 | \n", - "ligand-145bcf48 | \n", - "[N-2][C][c+]1[cH-][cH+][c-]([C][CH+][CH+][C][C... | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", - "0 | \n", - "
3 | \n", - "mof-b3a06d94 | \n", - "None | \n", - "None | \n", - "2024-04-13T23:18:01.291Z | \n", - "[10000.0, 0.0723458327] | \n", - "0.073941 | \n", - "ligand-0fa742f0 | \n", - "O=C([O-])c1ccc(C#C/[S+]=C/C#Cc2ccc(C(=O)O)cc2)cc1 | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", - "0 | \n", - "ligand-818dbabc | \n", - "[N-2][C][c+]1[cH-][cH+][c-]([C][CH+][CH+][N-][... | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", - "0 | \n", - "
4 | \n", - "mof-9d97b1ec | \n", - "None | \n", - "None | \n", - "2024-04-13T23:18:05.109Z | \n", - "[10000.0, 0.0714736096] | \n", - "0.260291 | \n", - "ligand-0bb2fcf6 | \n", - "[O-][C+](O)[C-]1[CH+][CH+][C+]([C][C][CH+][C][... | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", - "0 | \n", - "ligand-b9f216ab | \n", - "[N-2][C][c+]1[cH-][cH+][c-]([C][C][CH+][C][C][... | \n", - "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", - "0 | \n", - "
\n", + " | name | \n", + "topology | \n", + "catenation | \n", + "time | \n", + "gas_storage.CO2 | \n", + "structure_stability.uff | \n", + "ligand.COO.name | \n", + "ligand.COO.smiles | \n", + "ligand.COO.prompt_atoms | \n", + "ligand.COO.metadata.model_version | \n", + "ligand.cyano.name | \n", + "ligand.cyano.smiles | \n", + "ligand.cyano.prompt_atoms | \n", + "ligand.cyano.metadata.model_version | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "mof-0c85fcdc | \n", + "None | \n", + "None | \n", + "2024-10-05T18:34:31.991Z | \n", + "0.070712 | \n", + "0.198765 | \n", + "ligand-f36f085a | \n", + "O=C([O-])c1ccc(C=[N+]=C=CC#Cc2ccc(C(=O)O)cc2)cc1 | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", + "0 | \n", + "ligand-b1ae0877 | \n", + "[N-2][C][c+]1[cH-][cH+][c-]([C][CH+][CH+][C][C... | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", + "0 | \n", + "
1 | \n", + "mof-71f74056 | \n", + "None | \n", + "None | \n", + "2024-10-05T18:34:31.269Z | \n", + "0.087737 | \n", + "0.201572 | \n", + "ligand-f36f085a | \n", + "O=C([O-])c1ccc(C=[N+]=C=CC#Cc2ccc(C(=O)O)cc2)cc1 | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", + "0 | \n", + "ligand-fd4b41eb | \n", + "[N-2][C][c+]1[cH-][cH+][c-]([C][CH+][CH+][C][C... | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", + "0 | \n", + "
2 | \n", + "mof-12eebf0f | \n", + "None | \n", + "None | \n", + "2024-10-05T18:34:30.214Z | \n", + "0.074791 | \n", + "0.288125 | \n", + "ligand-f36f085a | \n", + "O=C([O-])c1ccc(C=[N+]=C=CC#Cc2ccc(C(=O)O)cc2)cc1 | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", + "0 | \n", + "ligand-a3cb2664 | \n", + "[N-2][C][c+]1[cH-][cH+][c-]([C][C][C][C][C][CH... | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", + "0 | \n", + "
3 | \n", + "mof-28b7f140 | \n", + "None | \n", + "None | \n", + "2024-10-05T18:34:30.073Z | \n", + "0.094516 | \n", + "0.157505 | \n", + "ligand-f36f085a | \n", + "O=C([O-])c1ccc(C=[N+]=C=CC#Cc2ccc(C(=O)O)cc2)cc1 | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", + "0 | \n", + "ligand-7cc6d992 | \n", + "[N-2][C][c+]1[cH-][cH+][c-]([C][C][CH+][C][C][... | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", + "0 | \n", + "
4 | \n", + "mof-93190b3c | \n", + "None | \n", + "None | \n", + "2024-10-05T18:34:30.003Z | \n", + "0.079115 | \n", + "0.230479 | \n", + "ligand-f36f085a | \n", + "O=C([O-])c1ccc(C=[N+]=C=CC#Cc2ccc(C(=O)O)cc2)cc1 | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13... | \n", + "0 | \n", + "ligand-d6b66392 | \n", + "[N-2][C][c+]1[cH-][cH+][c-]([C][C][CH+][CH+][C... | \n", + "[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, ... | \n", + "0 | \n", + "
\n", + " | \n", + " | found | \n", + "found_node-hr | \n", + "
---|---|---|---|
nodes | \n", + "retrain | \n", + "\n", + " | \n", + " |
32 | \n", + "False | \n", + "133.0 | \n", + "2.770833 | \n", + "
True | \n", + "313.0 | \n", + "6.520833 | \n", + "|
64 | \n", + "False | \n", + "426.5 | \n", + "4.442708 | \n", + "
True | \n", + "641.0 | \n", + "6.677083 | \n", + "|
128 | \n", + "True | \n", + "1622.0 | \n", + "8.447917 | \n", + "
256 | \n", + "True | \n", + "3633.0 | \n", + "9.460938 | \n", + "
450 | \n", + "True | \n", + "6554.0 | \n", + "9.709630 | \n", + "