diff --git a/10Cube.ipynb b/10Cube.ipynb index b3393d6..24d5175 100644 --- a/10Cube.ipynb +++ b/10Cube.ipynb @@ -111,7 +111,10 @@ "outputs": [], "source": [ "angles = np.pi / 4 * np.ones(number_subsystems//2)\n", + "# training data with 100000 steps\n", "hidden_state_traj, observable_traj = toymodel.generate_traj(100000, angles=angles, dim_noise=dim_noise)\n", + "\n", + "# validation data with 10000 steps\n", "hidden_state_traj_valid, observable_traj_valid = toymodel.generate_traj(10000, angles=angles, dim_noise=dim_noise)" ] }, @@ -266,7 +269,7 @@ "metadata": {}, "outputs": [], "source": [ - "tensorboard_installed = True\n", + "tensorboard_installed = False\n", "if tensorboard_installed:\n", " from torch.utils.tensorboard import SummaryWriter\n", " writer = SummaryWriter('./runs/Cube10/')\n", @@ -301,6 +304,16 @@ " lam_trace=0, start_mask=0, end_trace=0, tb_writer=writer, clip=False).fetch_model()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "95548c29", + "metadata": {}, + "outputs": [], + "source": [ + "# execution time (on cpu): ~ 4.5 min" + ] + }, { "cell_type": "markdown", "id": "3984d128", @@ -341,6 +354,16 @@ "plot_mask(mask, vmax=0.5, skip=2)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0d74aeb", + "metadata": {}, + "outputs": [], + "source": [ + "# reproduces Fig. 4c (or a permutation with respect to ivampnet state assignments)" + ] + }, { "cell_type": "markdown", "id": "143348c9", @@ -396,12 +419,14 @@ "id": "90807a5c", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# reproduces Fig. 
4d" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -415,7 +440,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.8" } }, "nbformat": 4, diff --git a/README.md b/README.md index 19ffd7d..54fc626 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,17 @@ Codebase for the iVAMPnets estimator and model which includes the classes for constructing the masks for toymodels and real protein applications. The implemented methods allow to decompose a possible high dimensional system in its weakly coupled or independent subsystems. Thereby, the downstream estimation of the kinetic models is much more data efficient than estimating a global kinetic model which might not be feasible. The whole pipeline is an end-to-end deep learning framework which allows to define your own network architectures for the kinetics estimation of each subsystem. -The data for the synaptotagmin C2A system is available upon request. The code is designed to reproduce the results of our paper "A deep learning framework for the decomposition of macromolecules into independent VAMPnets" (Link will be added) and is based on the deeptime package (see https://deeptime-ml.github.io/latest/index.html). +The data for the synaptotagmin C2A system is available upon request. The code is designed to reproduce the results of our paper "Deep learning to decompose macromolecules into independent Markovian domains" (https://www.biorxiv.org/content/10.1101/2022.03.30.486366v1) and is based on the deeptime package (see https://deeptime-ml.github.io/latest/index.html). The code includes: 1. (ivampnets.py) The definition of the ivampnets estimator class, which allows to fit a given model to simulation data. 
The definition of the ivampnets model class - the resulting model - which can then be used to estimate transition matrices, implied timescales, eigenfunctions, etc. 2. (masks.py) The definition of the mask modules, which can be used to give the modeler an intuition which part of the global system is assigned to which subsystem. 3. (examples.py) Helper functions to generate the data for the toy systems and plot some results. -4. (Toymodel_2Systems.ipynb) Notebook to reproduce the results for a simple truly independent 2D system. -5. (10Cube.ipynb) Notebook to reproduce the results for the 10-Cube example. -6. (SynaptotagminC2A.ipynb) Notebook to reproduce the results for a protein example. The data of the synaptotagmin C2A domain is available upon request. +4. (Toymodel_2Systems.ipynb) Notebook to reproduce the results for a simple truly independent 2D system. Typical runtime (CPU): 2 min +5. (10Cube.ipynb) Notebook to reproduce the results for the 10-Cube example. Typical runtime (CPU): 5 min +6. (SynaptotagminC2A.ipynb) Notebook to reproduce the results for a protein example. The data of the synaptotagmin C2A domain is available upon request. Typical runtime (CUDA): 1.5 hours -The code was executed using the following package versions: +The code was executed using the following package versions on a Linux computer (Debian bullseye): ``` python=3.6 or higher @@ -29,3 +29,34 @@ tensorboard=2.6.0 h5py=1.10.4 ``` +## Installation instructions + +The software dependencies can be installed with Anaconda / Miniconda. If you do not have Miniconda or Anaconda, please follow the instructions here: https://conda.io/miniconda.html + +The following command can be used to create a new conda environment and install all dependencies for the ivampnets scripts. 
+```bash +conda create -n ivampnets pytorch=1.8.0 deeptime=0.2.9 numpy=1.19.5 matplotlib=3.1.3 jupyter h5py -c conda-forge +``` +The new conda environment can be activated with +```bash +conda activate ivampnets +``` + + +If you already use conda and Jupyter notebooks with several environments, you can register the activated `ivampnets` environment as a Jupyter kernel via +```bash +python -m ipykernel install --user --name ivampnets +``` +This repository, including the Python scripts and Jupyter notebooks, can be downloaded with +```bash +git clone git@github.com:markovmodel/ivampnets.git +``` + +The following command will start the Jupyter notebook server: +```bash +jupyter notebook +``` + +Your browser should open and, once you navigate into the repository directory, show the list of notebooks. If the wrong browser opens, add for example `--browser=firefox` to the command, or copy and paste the URL into the browser of your choice. + +The typical install time ranges from 5 minutes for existing conda users to 20 minutes if conda has to be set up from scratch. 
diff --git a/SynaptotagminC2A.ipynb b/SynaptotagminC2A.ipynb index 0dccd13..86d67f6 100644 --- a/SynaptotagminC2A.ipynb +++ b/SynaptotagminC2A.ipynb @@ -3,6 +3,7 @@ { "cell_type": "code", "execution_count": null, + "id": "42847660", "metadata": {}, "outputs": [], "source": [ @@ -22,6 +23,7 @@ }, { "cell_type": "markdown", + "id": "43117557", "metadata": {}, "source": [ "### Hyperparameters" @@ -30,12 +32,13 @@ { "cell_type": "code", "execution_count": null, + "id": "6f87cbf8", "metadata": {}, "outputs": [], "source": [ - "stride = 2\n", + "stride = 1\n", "\n", - "tau = 10//stride \n", + "tau = 100//stride \n", "\n", "output_sizes = [8,8]\n", "number_subsystems = len(output_sizes)\n", @@ -46,7 +49,7 @@ "batch_size = 20000\n", "# Which trajectory points percentage is used as validation and testing, the rest is for training\n", "valid_ratio = 0.3\n", - "test_ratio = 0.2\n", + "test_ratio = 0.0001\n", "# How many hidden layers the network chi has\n", "network_depth = 3\n", "\n", @@ -78,6 +81,7 @@ }, { "cell_type": "markdown", + "id": "d10b4cf1", "metadata": {}, "source": [ "### Load data" @@ -86,9 +90,12 @@ { "cell_type": "code", "execution_count": null, + "id": "588c7ec9", "metadata": {}, "outputs": [], "source": [ + "# data set has a total length of 184 µs with a 1 ns resolution (total of 184000 frames)\n", + "\n", "data_trajs = []\n", "hdf5_names = []\n", "loaded_data_stride = 100\n", @@ -106,6 +113,7 @@ }, { "cell_type": "markdown", + "id": "51a0e9fd", "metadata": {}, "source": [ "### Define dataset" @@ -114,6 +122,7 @@ { "cell_type": "code", "execution_count": null, + "id": "f45ee7fb", "metadata": {}, "outputs": [], "source": [ @@ -125,6 +134,7 @@ { "cell_type": "code", "execution_count": null, + "id": "22c7b97e", "metadata": {}, "outputs": [], "source": [ @@ -136,6 +146,7 @@ }, { "cell_type": "markdown", + "id": "9d6acabf", "metadata": {}, "source": [ "### Define networks" @@ -144,6 +155,7 @@ { "cell_type": "code", "execution_count": null, + "id": "634d4b2b", 
"metadata": {}, "outputs": [], "source": [ @@ -180,6 +192,7 @@ }, { "cell_type": "markdown", + "id": "ea28cbbb", "metadata": {}, "source": [ "### Create iVAMPnets estimator" @@ -188,6 +201,7 @@ { "cell_type": "code", "execution_count": null, + "id": "05aabb2d", "metadata": {}, "outputs": [], "source": [ @@ -197,6 +211,7 @@ { "cell_type": "code", "execution_count": null, + "id": "0325caf5", "metadata": {}, "outputs": [], "source": [ @@ -205,6 +220,7 @@ }, { "cell_type": "markdown", + "id": "967887b4", "metadata": {}, "source": [ "### Plot mask before training" @@ -213,6 +229,7 @@ { "cell_type": "code", "execution_count": null, + "id": "dba6b102", "metadata": {}, "outputs": [], "source": [ @@ -222,6 +239,7 @@ }, { "cell_type": "markdown", + "id": "f03dd31d", "metadata": {}, "source": [ "### Create data loader" @@ -230,6 +248,7 @@ { "cell_type": "code", "execution_count": null, + "id": "06a08456", "metadata": {}, "outputs": [], "source": [ @@ -241,6 +260,7 @@ }, { "cell_type": "markdown", + "id": "c8a820a0", "metadata": {}, "source": [ "### Create a tensorboard writer to observe performance during training" @@ -249,10 +269,11 @@ { "cell_type": "code", "execution_count": null, + "id": "7038f8fc", "metadata": {}, "outputs": [], "source": [ - "tensorboard_installed = True\n", + "tensorboard_installed = False\n", "if tensorboard_installed:\n", " from torch.utils.tensorboard import SummaryWriter\n", " writer = SummaryWriter(log_dir='./runs/Syt/')\n", @@ -264,6 +285,7 @@ }, { "cell_type": "markdown", + "id": "229a10ff", "metadata": {}, "source": [ "### Fit the model on the training data" @@ -272,6 +294,7 @@ { "cell_type": "code", "execution_count": null, + "id": "bfaff88e", "metadata": {}, "outputs": [], "source": [ @@ -290,6 +313,7 @@ }, { "cell_type": "markdown", + "id": "474445e3", "metadata": {}, "source": [ "### Plot training and validation scores" @@ -298,6 +322,7 @@ { "cell_type": "code", "execution_count": null, + "id": "8a16599e", "metadata": {}, "outputs": [], 
"source": [ @@ -310,6 +335,7 @@ }, { "cell_type": "markdown", + "id": "2e65c886", "metadata": {}, "source": [ "### Plot the mask after training" @@ -318,6 +344,7 @@ { "cell_type": "code", "execution_count": null, + "id": "66a61989", "metadata": {}, "outputs": [], "source": [ @@ -327,6 +354,7 @@ { "cell_type": "code", "execution_count": null, + "id": "8437a367", "metadata": {}, "outputs": [], "source": [ @@ -336,6 +364,7 @@ }, { "cell_type": "markdown", + "id": "f01dfcd4", "metadata": {}, "source": [ "### Finally train without noise" @@ -344,6 +373,7 @@ { "cell_type": "code", "execution_count": null, + "id": "92c01b2a", "metadata": {}, "outputs": [], "source": [ @@ -357,6 +387,7 @@ { "cell_type": "code", "execution_count": null, + "id": "ddc0b3ab", "metadata": {}, "outputs": [], "source": [ @@ -369,6 +400,7 @@ }, { "cell_type": "markdown", + "id": "29e2875c", "metadata": {}, "source": [ "### Estimate implied timescales" @@ -377,6 +409,7 @@ { "cell_type": "code", "execution_count": null, + "id": "036bd72b", "metadata": {}, "outputs": [], "source": [ @@ -387,7 +420,7 @@ "N_trajs = len(dataset.trajectories)\n", "indexes_traj = np.arange(N_trajs)\n", "n_val = int(N_trajs * percentage)\n", - "msmlags=np.array([1,2,4,6,10,15,20,25])\n", + "msmlags=np.array([1,2,4,6,10,15,20,25])*10\n", "for run in range(runs):\n", " for tau_i in msmlags:\n", " np.random.shuffle(indexes_traj)\n", @@ -400,6 +433,7 @@ { "cell_type": "code", "execution_count": null, + "id": "0e16ad8b", "metadata": {}, "outputs": [], "source": [ @@ -414,11 +448,12 @@ { "cell_type": "code", "execution_count": null, + "id": "57c8d5a3", "metadata": {}, "outputs": [], "source": [ "axes, fig = plot_protein_its(its_reorder, msmlags, ylog=True, multiple_runs=True, percent=0.9)\n", - "x_ticks = np.array([1,5,10,20,40])\n", + "x_ticks = np.array([1,5,10,20,40])*10\n", "x_ticks_labels = x_ticks*stride # for estimating the right units!\n", "y_ticks = np.array([1000,10000, 100000])/stride\n", "y_ticks_labels = 
y_ticks*stride/1000\n", @@ -436,7 +471,7 @@ " ax.set_yticklabels(y_ticks_labels, fontsize=14)\n", " ax.tick_params(direction='out', length=6, width=2, colors='k',\n", " grid_color='k', grid_alpha=0.5)\n", - " ax.set_xlim(1,25)\n", + " ax.set_xlim(10,250)\n", " ax.set_ylim(0.01*1000, 200*1000)\n", " # fig.savefig('./Syt_its.pdf', bbox_inches='tight')\n", "\n", @@ -446,6 +481,27 @@ { "cell_type": "code", "execution_count": null, + "id": "9df5595e", + "metadata": {}, + "outputs": [], + "source": [ + "# reproduces Fig. 5b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e6ec4ad", + "metadata": {}, + "outputs": [], + "source": [ + "# ivampnet.save_params('./Syt_params')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f51b21f6", "metadata": {}, "outputs": [], "source": [] @@ -467,7 +523,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.8.8" } }, "nbformat": 4, diff --git a/Toymodel_2Systems.ipynb b/Toymodel_2Systems.ipynb index 30fbdce..3e37187 100644 --- a/Toymodel_2Systems.ipynb +++ b/Toymodel_2Systems.ipynb @@ -110,7 +110,10 @@ "metadata": {}, "outputs": [], "source": [ + "# training data with 100000 steps\n", "hidden_state_traj, observable_traj = toymodel.generate_traj(100000)\n", + "\n", + "# validation data with 10000 steps\n", "hidden_state_traj_valid, observable_traj_valid = toymodel.generate_traj(10000)" ] }, @@ -293,7 +296,7 @@ "metadata": {}, "outputs": [], "source": [ - "tensorboard_installed = True\n", + "tensorboard_installed = False\n", "if tensorboard_installed:\n", " from torch.utils.tensorboard import SummaryWriter\n", " writer = SummaryWriter('./runs/Toy2/')\n", @@ -323,6 +326,16 @@ " print('The model does not seem to be converged to an independent solution!')" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "edcbc766", + "metadata": {}, + "outputs": [], + "source": [ + "# execution time (cpu): ~ 30 sec" + ] + }, 
{ "cell_type": "markdown", "id": "375d23fc", @@ -363,6 +376,16 @@ "plot_mask(mask)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ed8483a", + "metadata": {}, + "outputs": [], + "source": [ + "# reproduces Fig. 3b (or permutation of it)" + ] + }, { "cell_type": "markdown", "id": "f0935fa5", @@ -442,11 +465,21 @@ "from examples import plot_eigfuncs\n", "plot_eigfuncs(model, val_data)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d8049c4", + "metadata": {}, + "outputs": [], + "source": [ + "# reproduces Fig. 3c (possibly with permutation of state assignments)" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -460,7 +493,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.8" } }, "nbformat": 4,