From c52d1f42bc455d745b19c9d0eb8725d71cc5de47 Mon Sep 17 00:00:00 2001
From: thalassemia
Date: Sat, 20 Jul 2024 23:30:31 +0800
Subject: [PATCH] Fix some documentation typos

---
 doc/composites.rst | 6 +--
 doc/experiments.rst | 52 ++++++++++++---------
 doc/index.rst | 4 +-
 doc/output.rst | 40 +++++++++--------
 doc/processes.rst | 29 ++++++------
 doc/stores.rst | 54 +++++++++++-----------
 doc/workflows.rst | 55 +++++++++++++----------
 ecoli/composites/ecoli_master.py | 15 +++----
 ecoli/experiments/ecoli_engine_process.py | 10 +++--
 ecoli/experiments/ecoli_master_sim.py | 1 +
 ecoli/processes/engine_process.py | 10 ++---
 ecoli/processes/global_clock.py | 6 ++-
 ecoli/variants/new_gene_internal_shift.py | 13 +++---
 13 files changed, 156 insertions(+), 139 deletions(-)

diff --git a/doc/composites.rst b/doc/composites.rst
index b47ef765e..73083f264 100644
--- a/doc/composites.rst
+++ b/doc/composites.rst
@@ -3,7 +3,7 @@ Composites
 ==========
 
 :py:class:`~ecoli.composites.ecoli_master.Ecoli` is a so-called composer
-that is responsible for aggregating all the Processes, Steps, topologies,
+that is responsible for aggregating Processes, Steps, topologies,
 and the flow for the Steps into a unified "composite" model that
 vivarium-core is able to run. Unlike a typical Vivarium composer which simply
 collects all these pieces, the :py:class:`ecoli.composites.ecoli_master.Ecoli` composer
@@ -28,7 +28,7 @@
 The :py:meth:`~ecoli.composites.ecoli_master.Ecoli.generate_processes_and_steps`
 method of the :py:class:`~ecoli.composites.ecoli_master.Ecoli` composer is responsible
 for creating these two Steps, the :py:class:`~ecoli.processes.allocator.Allocator`
 steps sandwiched between them in each execution layer, and the
-:py:class:`~ecoli.processes.unique_update.UniqueUpdate` Steps the run at the very end
+:py:class:`~ecoli.processes.unique_update.UniqueUpdate` Steps that run at the very end
 of each execution layer. It is also responsible for updating the flow to arrange these
 Steps in the order described in :ref:`implementation`. As an end-user, all you have to
 do to add a new partitioned process is ensure that it inherits from
@@ -91,7 +91,7 @@ to visualize these updates.
 
 .. warning::
 
-    This feature should only be turned for debugging purposes and
-    only when using the in-memory emitter (``timeseries``).
+    This feature should only be turned on for debugging purposes and
+    only when using the in-memory emitter (see :ref:`ram_emitter`).
 
 -------------
 Initial State
diff --git a/doc/experiments.rst b/doc/experiments.rst
index c09812ff8..7e1b3d0b7 100644
--- a/doc/experiments.rst
+++ b/doc/experiments.rst
@@ -6,10 +6,20 @@ Experiments
 interface for configuring and running single-cell simulations. We
 refer to simulations as experiments, and all simulations (or batches
 of simulations run in a single workflow, see :ref:`/workflows.rst`) are
-identified via a unique experiment ID. If data is being persisted to
-disk (i.e. not using ``timeseries`` in-memory emitter), then simulations
-or workflows with the same experiment ID will overwrite data from any past
-simulations or workflows with the same experiment ID.
+identified via a unique experiment ID.
+
+.. warning::
+    If data is being persisted to disk (see :ref:`parquet_emitter`), simulations
+    or workflows will overwrite data from any past simulations or workflows with
+    the same experiment ID.
+
+When running workflows with :py:mod:`runscripts.workflow` (see :ref:`/workflows.rst`),
+users are prevented from accidentally overwriting data by ``nextflow``, the software
+used to run the workflow.
+Specifically, nextflow generates an HTML execution report
+in the output folder for a given experiment ID (see :ref:`output`)
+and will refuse to run another workflow with the same experiment ID unless
+that execution report is renamed, moved, or deleted.
+
 
 .. _sim_config:
 
@@ -43,25 +53,26 @@ JSON Config Files
 The :py:class:`~ecoli.experiments.ecoli_master_sim.EcoliSim` class relies upon
 the helper :py:class:`~ecoli.experiments.ecoli_master_sim.SimConfig` class to
 load configuration options from JSON files and merge them with options specified via
-the command line. The configuration options are always loaded in the following order:
+the command line. The configuration options are always loaded in the following order,
+with options loaded later overriding those from earlier sources:
 
 #. The options in the default JSON config file (located at
    :py:data:`~ecoli.experiments.ecoli_master_sim.SimConfig.default_config_path`)
 #. The options in the JSON config file specified via ``--config`` in the command line.
-#. The other options specified via the command line.
+#. The options specified via the command line.
 
 In most cases, configuration options that appear in more than one
-of the above sources are successively overriden. The sole exceptions
-are configuration options listed in
+of the above sources are successively overridden in their entirety. The sole
+exceptions are configuration options listed in
 :py:attr:`~ecoli.experiments.ecoli_master_sim.LIST_KEYS_TO_MERGE`. These
-options hold list values that are concatenated with one another instead
-of being overriden.
+options hold lists of values that are concatenated with one another instead
+of being wholly overridden.
 
 Notice that the options in the default JSON config file are always loaded
 first. This means that if you would like to run a simulation or workflow
 that leaves some of these options alone, you can simply omit those options
-from the JSON config file that you create and pass to the runscript
+from the JSON config file that you create and pass to your runscript
 of choice via ``--config``.
 
 Below is an annotated copy of the default simulation-related configuration
@@ -82,7 +93,7 @@ documented in :ref:`/workflows.rst`.
     # String that uniquely identifies simulation (or workflow if passed
    # as input to runscripts/workflow.py). Avoid special characters as we
     # quote experiment IDs using urlparse.parse.quote_plus, which may make
-    # experiment IDs with special characters hard to decipher.
+    # experiment IDs with special characters hard to decipher later.
     "experiment_id": "experiment_id_one"
     # Whether to append date and time to experiment ID in the following format
     # experiment_id_%d-%m-%Y_%H-%M-%S.
@@ -91,7 +102,7 @@
     "suffix_time": true,
     "description": "",
     # Whether to display vivarium-core progress bar
     "progress_bar" : true,
-    # Path to pickle file output by parameter calculator (runscripts/parca.py).
+    # Path to pickle file output from parameter calculator (runscripts/parca.py).
     # Only used for single sim run with ecoli/experiments/ecoli_master_sim.py.
     # Ignored when run with runscripts/workflow.py because each simulation is
     # automatically run with the appropriate variant/baseline simulation data.
@@ -101,8 +112,8 @@
     "sim_data_path": null,
     # "timeseries" for in-memory emitter (good for single sims and parameter scans),
     # to Parquet files on disk (good for workflows and more in-depth analyses)
     "emitter" : "timeseries",
     # If choosing "parquet" emitter, must provide "out_dir" with path (relative
-    # or absolute) to output folder or "out_uri" with URI for Google Cloud Storage
-    # bucket. ONLY CHOOSE ONE.
+    # or absolute) to output folder OR "out_uri" with URI for Google Cloud Storage
+    # bucket. Only provide one of the above.
     "emitter_arg": {"out_dir": "out"},
     # See API documentation on vivarium-core for vivarium.core.engine.Engine.
     # Can usually leave as false.
@@ -115,7 +126,7 @@
     "log_updates" : false,
     # Controls output format for ecoli.experiments.ecoli_master_sim.EcoliSim.query.
     # Should only be used if choosing "timeseries" emitter. See API documentation
-    # for the function above for more information.
+    # for the query function for more information.
     "raw_output" : true,
     # Initial seed used to generate the seeds that are used to initialize
-    # the psuedorandom number generators in the model. Only used for single
+    # the pseudorandom number generators in the model. Only used for single
@@ -345,9 +356,9 @@
     }
 }
 
-Here are some general rules to remember when writing JSON files:
+Here are some general rules to remember when writing your own JSON config files:
 
-- String must be enclosed in double quotes (not single quotes)
+- Strings must be enclosed in double quotes (not single quotes)
 - Booleans are lowercase
 - None values are written as (unquoted) ``null``
 - Trailing commas are not allowed
 
@@ -359,7 +370,7 @@ Output
 ------
 
 If ``emitter`` was set to ``parquet``, then folders containing the simulation output are
-created as described in :ref:`/output.rst`.
+created as described in :ref:`parquet_emitter`.
 
 If ``division`` is set to True, :py:mod:`~ecoli.experiments.ecoli_master_sim` will save
 the initial states of the two daughter cells resulting from cell division
@@ -412,8 +423,7 @@
 by setting ``_emit`` to ``True``.
 
 .. warning::
     Vivarium includes internal checks to ensure that all ports connected to a store
     give the same or compatible (no conflicting keys) schemas for that store. This
     means that if you would like to override the schema for a store with many
-    connecting ports, you will likely need to override the ports schemas for all
-    relevant ports.
+    connecting ports, you will need to override the schemas for all the relevant ports.
 
 ------------------
 Colony Simulations
diff --git a/doc/index.rst b/doc/index.rst
index 2b1cb609f..fa6036616 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -2,9 +2,9 @@ Welcome to Vivarium *E. coli*'s documentation!
 ==============================================
 
 Vivarium *E. coli* is a port of the |text|_ to the `Vivarium framework `_.
-For more scientific details about the model, refer to the
+For more scientific modeling details, refer to the
 `documentation `_
-for the model as well its corresponding publication
+for the original model as well as its corresponding publication
 (`10.1126/science.aav3751 `_).
 This website covers how the model was implemented using Vivarium and describes
 the user interface for developing and running the model. We recommend new users read
 through the sections below in order.
diff --git a/doc/output.rst b/doc/output.rst
index 03b84be85..310befcf8 100644
--- a/doc/output.rst
+++ b/doc/output.rst
@@ -17,11 +17,11 @@
 to that store. By default, we always emit data for:
 
 - Bulk molecules store located at ``("bulk",)``:
   The :py:func:`~ecoli.library.schema.numpy_schema` helper function that we use
-  to create the schema for ports to bulk and unique molecule stores automatically
+  to create the schema for ports to the bulk store automatically
   sets ``_emit`` to True when the ``name`` argument is ``bulk``.
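+
+  As a hedged sketch (the exact keyword arguments accepted by
+  :py:func:`~ecoli.library.schema.numpy_schema` may differ; only the ``name``
+  argument matters here), a process port schema might look like::
+
+      def ports_schema(self):
+          # passing "bulk" as the name makes the helper set ``_emit`` to True
+          return {"bulk": numpy_schema("bulk")}
+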
 - Listeners located at ``("listeners",)``:
   The :py:func:`~ecoli.library.schema.listener_schema` helper function that we use
-  to create the schema for ports to stores located somewhere in the store hierarchy
+  to create the schema for ports to stores located somewhere in the hierarchy
   under the ``listener`` store automatically sets ``_emit`` to True
 
 .. _serializing_emits:
 
@@ -45,6 +45,8 @@
 For details about reading data back after it has been saved, refer to
 :ref:`ram_read` for the in-memory data format and :ref:`parquet_read` for
 the persistent storage format.
 
+.. _ram_emitter:
+
 -----------------
 In-Memory Emitter
 -----------------
@@ -90,6 +92,8 @@
 of the :py:class:`~vivarium.core.registry.Serializer` instance whose
 :py:meth:`~vivarium.core.registry.Serializer.can_deserialize` method returns
 True on the data to deserialize.
 
+.. _parquet_emitter:
+
 ---------------
 Parquet Emitter
 ---------------
@@ -108,19 +112,19 @@ In Hive partitioning, certain keys in data are used to partition the data into f
 In the vEcoli Parquet emitter, the keys used for this purpose are the experiment ID,
 variant index, lineage seed (initial seed for cell lineage), generation, and agent ID.
 These keys uniquely identify a single cell simulation, meaning each simulation process
-will write data to its own folder in final output with a path like::
+will write data to its own folder in the final output with a path like::
 
     experiment_id={}/variant={}/lineage_seed={}/generation={}/agent_id={}
 
 This allows workflows that run simulations with many variant simulation data objects,
 lineage seeds, generations, and agent IDs to all write data to the same main output
-folder without overwriting any data.
+folder without simulations overwriting one another.
 
 Parquet Files
 =============
 
 Because Parquet is a tabular file format (think in terms of columns like a Pandas
-DataFrame), additional serialization steps must be taken after the data to save
+DataFrame), additional serialization steps must be taken after the emit data
 has been converted to JSON format in accordance with :ref:`serializing_emits`.
 The Parquet emitter (:py:class:`~ecoli.library.parquet_emitter.ParquetEmitter`)
 first calls :py:func:`~ecoli.library.parquet_emitter.flatten_dict` in order to
@@ -200,11 +204,11 @@ Schemas constructed with the :py:func:`~ecoli.library.schema.listener_schema` helper
-function can populate this metdata concisely. These metadata values are compiled
+function can populate this metadata concisely. These metadata values are compiled
 for all stores in the simulation state hierarchy by
 :py:meth:`~ecoli.experiments.ecoli_master_sim.EcoliSim.get_output_metadata`. In the
-saved configuration Parquet file, the metadata values will be located under
+saved configuration Parquet file, the metadata values will be located in
 columns with names equal to the double-underscore concatenated store path
 prefixed by ``output_metadata__``. For convenience, the
 :py:func:`~ecoli.library.parquet_emitter.get_field_metadata` can be used in
-analysis scripts to read back this metadata.
+analysis scripts to read this metadata.
 
 ``history``
 -----------
@@ -213,13 +217,13 @@
 Each simulation will save Parquet files containing serialized simulation output
 inside its corresponding Hive partition under the ``history`` folder. The columns
 in these Parquet files come from flattening the hierarchy of emitted stores.
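+As an illustrative sketch (hypothetical emit values; see
+:py:func:`~ecoli.library.parquet_emitter.flatten_dict` for the actual
+implementation), the flattening joins store paths with double underscores::
+
+    flatten_dict({"listeners": {"mass": 1.0}, "time": 2.0})
+    # == {"listeners__mass": 1.0, "time": 2.0}
+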
 To leverage Parquet's columnar compression and efficient reading, we batch many time steps worth
-of emits into a temporary file before reading them into a
+of emits into a temporary newline-delimited JSON file before reading them into a
 `PyArrow `_ table where each row
-contains the column values for a single time step. This PyArrow table can then be
+contains the column values for a single time step. This PyArrow table is then
 written to a Parquet file named ``{batch size * number of batches}.pq`` (e.g.
 ``400.pq``, ``800.pq``, etc. for a batch size of 400). The default batch size of 400
 has been tuned for our current model but can be adjusted via ``emits_to_batch``
-under the ``emitter_arg`` option in configuration JSONs.
+under the ``emitter_arg`` option in a configuration JSON.
 
 .. _parquet_read:
 
@@ -240,19 +244,19 @@ to read data using DuckDB. These include:
 
   ``config_sql`` that reads data from Parquet files with filters applied when
   run using :py:mod:`runscripts.analysis`.
 - :py:func:`~ecoli.library.parquet_emitter.num_cells`: Quickly get a count of
-  the number of cells worth of data included in a SQL query
+  the number of cells whose data is included in a SQL query
 - :py:func:`~ecoli.library.parquet_emitter.skip_n_gens`: Add a filter to an SQL
   query to skip the first N generations worth of data
-- :py:func:`~ecoli.library.parquet_emitter.ndlist_to_ndarray`: Convert a Parquet
+- :py:func:`~ecoli.library.parquet_emitter.ndlist_to_ndarray`: Convert a PyArrow
   column of nested lists into a N-D Numpy array
 - :py:func:`~ecoli.library.parquet_emitter.ndarray_to_ndlist`: Convert a N-D Numpy
-  array into a Parquet column of nested lists
+  array into a PyArrow column of nested lists
 - :py:func:`~ecoli.library.parquet_emitter.ndidx_to_duckdb_expr`: Get a DuckDB SQL
   expression which can be included in a ``SELECT`` statement that uses Numpy-style
   indexing to retrieve values from a nested list Parquet column
 - :py:func:`~ecoli.library.parquet_emitter.named_idx`: Get a DuckDB SQL expression
-  which can be included in a ``SELECT`` statement that extracts certain indices
-  of values from a nested list Parquet column and returns them as new named columns
+  which can be included in a ``SELECT`` statement that extracts values at certain indices
+  from each row of a nested list Parquet column and returns them as individually named columns
 - :py:func:`~ecoli.library.parquet_emitter.get_field_metadata`: Read saved store
   metadata (see :ref:`configuration_parquet`)
 - :py:func:`~ecoli.library.parquet_emitter.get_config_value`: Read option from
@@ -264,7 +268,7 @@
-    to large to read into memory all at once).
+    too large to read into memory all at once).
 
 .. warning::
-    Parquet lists are 1-indexed. The :py:func:`~ecoli.library.parquet_emitter.ndidx_to_duckdb_expr`
+    Parquet lists are 1-indexed. :py:func:`~ecoli.library.parquet_emitter.ndidx_to_duckdb_expr`
     and :py:func:`~ecoli.library.parquet_emitter.named_idx` automatically add 1 to
     user-supplied indices.
 
@@ -274,7 +278,7 @@ Construct SQL Queries
 
 The true power of DuckDB is unlocked when SQL queries are iteratively
 constructed. This can be accomplished in one of two ways:
 
-- For simpler queries, you can wrap a complete DuckDB SQL expression in parenthesis to use as
+- For simpler queries, you can wrap a complete DuckDB SQL expression in parentheses to use as
   the input table to another query.
-  For example, to calculate the average cell and dry mass for
+  For example, to calculate the average cell and dry mass
   over all time steps for all cells accessible to an analysis script:
 
     SELECT avg(mass_listener__cell_mass) AS avg_cell_mass,
         avg(mass_listener__dry_mass) AS avg_dry_mass
     FROM (
         {history_sql}
     )
 )
 
-  In this case, ``history_sql`` can be slotted in programmatically using an f-string.
+  ``history_sql`` can be slotted in programmatically using an f-string.
 
 - For more advanced, multi-step queries, you can use `common table expressions
   `_ (CTEs). For example, to run the same query above but first averaging over all time steps
diff --git a/doc/processes.rst b/doc/processes.rst
index 5c6cfdc67..248c9d3de 100644
--- a/doc/processes.rst
+++ b/doc/processes.rst
@@ -20,12 +20,12 @@ Registration
 ------------
 
 In order for a process to be recognized by our main simulation runscript
-(:py:mod:`~ecoli.experiments.ecoli_master_sim`) by name, it must register
-itself in a few key places. First, a process must be registered in the
+(:py:mod:`~ecoli.experiments.ecoli_master_sim`) by name, it must be registered
+in a few key places. First, a process must be registered in the
 :py:data:`~vivarium.core.registry.process_registry` in the
 ``ecoli/processes/__init__.py`` file.
 
-Second, it should register its topology in :py:data:`~ecoli.processes.registries.topology_registry`.
-This is usally accomplished by having the following lines at the top of the
+Second, its topology must be registered in :py:data:`~ecoli.processes.registries.topology_registry`.
+This is usually accomplished by having the following lines at the top of the
 process file::
 
@@ -66,7 +66,7 @@ instead of :py:class:`~vivarium.core.process.Process` or :py:class:`~vivarium.co
 During sim initialization, :py:meth:`~ecoli.composites.ecoli_master.Ecoli.generate_processes_and_steps`
 uses each :py:class:`~ecoli.processes.partition.PartitionedProcess` to create two
 processes that inherit from :py:class:`~vivarium.core.process.Step` and have the required
-:py:meth:`~vivarium.core.process.Process.next_update` methods: a
+:py:meth:`~vivarium.core.process.Process.next_update` method: a
 :py:class:`~ecoli.processes.partition.Requester` and an
 :py:class:`~ecoli.processes.partition.Evolver`. These processes share an initialized
 :py:class:`~ecoli.processes.partition.PartitionedProcess` instance, meaning
@@ -106,30 +106,27 @@ the ``time_step`` key in the parameter dictionary.
 However, most processes in vEcoli inherit from
 :py:class:`~vivarium.core.process.Step` and not
 :py:class:`~vivarium.core.process.Process`. Instead of running with a
 certain time step, Steps, by default, are run at the end of every time
-step where at least one :py:class:`~vivarium.core.process.Process`
-ran. Since we only used Steps instead of Processes in order to enforce
-a certain execution order within time steps, we need to somehow allow
-Steps to run with something like a time step.
+step where at least one :py:class:`~vivarium.core.process.Process` ran.
 
-To achieve this, we:
+To change this to allow our Steps to run with a time step like a
+Process, we:
 
 #. Added a top-level store to hold the global simulation time step at ``("timestep",)``.
 #. Added a top-level store to hold the global time at ``("global_time",)`` with a
-   default value of this store is 0.
+   default value of 0.
 #. Added a store for each process located at ``("next_update_time", "process_name")``
    which has a default value of ``("timestep",)``.
 #. Added logic to the :py:meth:`~vivarium.core.process.Process.next_update`
-   methods (or :py:meth:`~ecoli.processes.partition.PartitionedProcess.calculate_request`
-   or :py:meth:`~ecoli.processes.partition.PartitionedProcess.evolve_state`
+   methods (:py:meth:`~ecoli.processes.partition.PartitionedProcess.evolve_state`
    for partitioned processes) to increment ``("next_update_time", "process_name")``
    by ``("timestep",)`` every time the Step is run.
 #. Added a :py:class:`~ecoli.processes.global_clock.GlobalClock` process that
    calculates the smallest difference between the current ``("global_time",)``
    and each Step's ``("next_update_time", "process_name")``. This process has
    a custom :py:meth:`~vivarium.core.process.Process.calculate_timestep` method
-   to tell vivarium-core to only run this process after this minimum time
-   difference has elapsed in vivarium-core's internal simulation clock. At that
-   time, this process advances ``("global_time",)`` by that minimum time difference.
+   to tell vivarium-core to only run this process after its internal simulation
+   clock reaches the soonest next update time among the other processes. At that
+   time, this process advances ``("global_time",)`` to match the internal clock.
 
 Taken together, these actions guarantee that we never accidentally
 skip over a Step's scheduled update time and also that our manual
 time stepping scheme stays perfectly in sync with vivarium-core's built-in
@@ -148,7 +145,7 @@ Steps in the same ``("timestep",)`` store, a Process or Step only needs to modif
 this store for all Steps to register this change. Conversely, say we wanted
 to have each Step run with its own time step instead of a global time step.
 We could implement this by simply changing the topologies of each Step to connect
-to a dedicated time step store ``("timestep", "process_name")``, unlinking time steps
+to a dedicated time step store ``("timestep", "process_name")``, unlinking the time steps
 for each Step.
 
 .. note::
diff --git a/doc/stores.rst b/doc/stores.rst
index d9e3d6eb6..eb75dc762 100644
--- a/doc/stores.rst
+++ b/doc/stores.rst
@@ -49,8 +49,8 @@ prefaced by a series of relevant attributes including:
 :Divider: Function used to split the store during cell division
 :Serializer: Instance of :py:class:`vivarium.core.registry.Serializer`
-    used to serialize store data before being emitted
-:Schema: Helper function to create store schema in ``ports_schema`` methods
+    used to serialize store data before being emitted (see :ref:`serializing_emits`)
+:Schema: Helper function to create store schema in process ``ports_schema`` methods
 :Helpers: Other useful helper functions
 
 .. WARNING::
@@ -80,13 +80,13 @@ molecules store holds a `structured Numpy array
 `_ with the following named fields:
 
-  1. ``id`` (:py:class:`str`): Names of bulk molecules as pulled from `EcoCyc `_
+  1. ``id`` (:py:class:`str`): Names of bulk molecules pulled from `EcoCyc `_
-     Each end with a bracketed "location tag" (e.g. ``[c]``) containing
-     one of the abbreviations defined in the
+     Each ends with a bracketed "location tag" (e.g. ``[c]``) containing
+     one of the abbreviations defined in the ``reconstruction/ecoli/flat/compartments.tsv`` file
      (see `Cell Component Ontology `_)
  2. 
``count`` (:py:attr:`numpy.int64`): Counts of bulk molecules - Note that the :py:meth:`~ecoli.processes.partition.PartitionedProcess.evolve_state` + Note that the :py:meth:`~ecoli.processes.partition.PartitionedProcess.evolve_state` method of :py:class:`~ecoli.processes.partition.PartitionedProcess` does not see the full structured array with named fields through its bulk port and instead sees a 1D array of partitioned counts from the allocator (see :ref:`partitioning`). @@ -98,11 +98,12 @@ with the following named fields: Initialization ============== To create the initial value for this store, the model will go through -the following three options in order: +the following three options in order of preference: 1. Load custom initial state Set ``initial_state`` option for :py:class:`~ecoli.experiments.ecoli_master_sim.EcoliSim` + (see :ref:`json_config`) 2. Load from saved state JSON Set ``initial_state_file`` option for @@ -133,7 +134,7 @@ This setup has a potential problem: two processes may both decide to deplete the count of the same molecule, resulting in a final count that is negative. To prevent this from happening, the model forces processes to first request counts of bulk molecules via special process-specific ``request`` stores. These -stores are read by a special allocator process +stores are read by an allocator process (:py:class:`~ecoli.processes.allocator.Allocator`). The allocator process then divides the bulk molecules so that each process sees a functional count proportional to its request. @@ -153,7 +154,7 @@ molecule binding and complexation events occur on timescales much shorter than t default 1 second simulation timestep, :py:class:`~ecoli.processes.equilibrium.Equilibrium` and :py:class:`~ecoli.processes.two_component_system.TwoComponentSystem` must wait for :py:class:`~ecoli.processes.tf_unbinding.TfUnbinding` to update -the simulation state by freeing currently bound transcription factors. This allows +the simulation state by freeing currently bound transcription factors. This gives all transcription factors a chance to form complexes or participate in other reactions, better reflecting the transient binding dynamics of real cells. :py:class:`~ecoli.processes.tf_binding.TfBinding` must wait for these @@ -162,7 +163,7 @@ new active transcription factor counts, must wait for the counts of transcription factors bound to promoters, and so on. To allow processes to run in a pre-specified order within -each timestep, we can make use of a special subclass of the typical Vivarium +each timestep, we can make use of a subclass of the typical Vivarium :py:class:`~vivarium.core.process.Process` class: :py:class:`~vivarium.core.process.Step`. Almost all "processes" in the model are actually instances of :py:class:`~vivarium.core.process.Step`. These Steps @@ -225,30 +226,28 @@ a :py:class:`~ecoli.processes.partition.Requester` and an :py:class:`~ecoli.processes.partition.Evolver`. For each execution layer in the ``flow`` given to :py:class:`~ecoli.experiments.ecoli_master_sim.EcoliSim`, :py:class:`~ecoli.composites.ecoli_master.Ecoli` will create Requesters, Evolvers, -and other required Steps arranged to be executed in the following order: +and other required Steps and arrange them to be executed in the following order: 1. 
Requesters: - Each calls the + Each calls the :py:meth:`~ecoli.processes.partition.PartitionedProcess.calculate_request` - method of a :py:class:`~ecoli.processes.partition.PartitionedProcess` + method of a :py:class:`~ecoli.processes.partition.PartitionedProcess` in said layer and writes its requests to a process-specific ``request`` store 2. Allocator: Once all Requesters in said layer have finished writing their requests, an - instance of :py:class:`~ecoli.processes.allocator.Allocator` + instance of :py:class:`~ecoli.processes.allocator.Allocator` reads all the written ``request`` stores and proportionally allocates bulk molecules to processes, writing allocated counts to process-specific ``allocate`` stores 3. Evolvers: Each swaps the Numpy structured array of unpartitioned bulk counts in the - ``bulk`` port with the 1D array of allocated counts in its corresponding + ``bulk`` port with the 1D array of allocated counts in the corresponding ``allocate`` store, calls the :py:meth:`~ecoli.processes.partition.PartitionedProcess.evolve_state` method of its :py:class:`~ecoli.processes.partition.PartitionedProcess`, - updates the bulk molecule counts, and sends unique molecule updates - to be accumulated by each unique molecule updater - (see :py:class:`~ecoli.library.schema.UniqueNumpyUpdater`) + and returns updates to the bulk molecule counts and unique molecule stores 4. Unique updater: An instance of @@ -277,11 +276,9 @@ processes access to non-partitioned counts, an additional port is added to their ``ports_schema`` methods and topologies that is also connected to the bulk molecules store. By convention, this port is called ``bulk_total`` to differentiate it from the partitioned ``bulk`` port. As noted in :ref:`implementation`, -Evolvers overwrite port named ``bulk`` with the allocated bulk counts. Due to being +Evolvers overwrite the port named ``bulk`` with the allocated bulk counts. Due to being named ``bulk_total`` instead of ``bulk``, the non-partitioned port value is left -untouched and allows the Evolver to read non-partitioned counts at will (i.e. -inside the :py:meth:`~ecoli.processes.partition.PartitionedProcess.evolve_state` method -of its associated :py:class:`~ecoli.processes.partition.PartitionedProcess`). +untouched and allows the Evolver to read non-partitioned counts at will. Indexing @@ -299,7 +296,7 @@ in :py:meth:`~ecoli.processes.polypeptide_elongation.PolypeptideElongation.calcu for an example. Though counts can be directly retrieved from the Numpy structured array (e.g. -``states["bulk"]["count"][self.ntp_idx]``), this method of access does not workflow +``states["bulk"]["count"][self.ntp_idx]``), this method of access does not work for :py:class:`~ecoli.processes.partition.Evolver` (i.e. inside the :py:meth:`~ecoli.processes.partition.PartitionedProcess.evolve_state` method) because they automatically replace the non-partitioned Numpy structured array @@ -373,7 +370,7 @@ Listeners automatically serialized to JSON by ``orjson``. For example, listeners holding values with Unum units will be serialized by :py:class:`~ecoli.library.serialize.UnumSerializer`, which is - registered in ``ecoli/__init__.py``. + registered in ``ecoli/__init__.py``. See :ref:`serializing_emits`. 
 :Schema: :py:func:`ecoli.library.schema.listener_schema`
 :Helpers: None
@@ -389,18 +386,17 @@ contains substores for various masses of interest, such as ``cell_mass``,
 
 Initialization
 ==============
 
-Listener stores are initialized at the start of a simulation with their
-default values as specified in the
+Listener stores are initialized at the start of a simulation with the
+default values specified in the
 :py:meth:`~vivarium.core.process.Process.ports_schema` methods of the processes
 that connect to them. Refer to :py:func:`ecoli.library.schema.listener_schema`
 for information about how to configure this default value as well as attach
 useful metadata to specific listener values.
 
-Listener stores must contain data of the same type or data that is configured
-be configured to serialized to the same type for the duration of a simulation
-(``None`` allowed for null values). This is becuase the Parquet storage format
-used to persist simulation output to disk is a columnar format that requires
+Listener stores must contain data of the same type (or data that is serialized
+to the same type) for the duration of a simulation. This is because the Parquet
+storage format (see :ref:`parquet_emitter`) is a columnar format that requires
 columns to have static data types. Some leeway is allowed for ``None`` (null)
 values in nested types. For example, ``[]`` and ``[0]`` both work fine for a
 column containing 1D lists of integers.
diff --git a/doc/workflows.rst b/doc/workflows.rst
index 85ce925e2..95f6f21ab 100644
--- a/doc/workflows.rst
+++ b/doc/workflows.rst
@@ -39,9 +39,9 @@ whose path must be given via the ``sim_data_path`` configuration option to all r
 in ``runscripts/`` and to experiment modules in ``ecoli/experiments`` (default used by
 :py:mod:`runscripts.workflow` is :py:mod:`~ecoli.experiments.ecoli_master_sim`).
 
-The code responsible for loading data from the raw flat files is orchestrated by
+The code responsible for loading data from the raw flat files is contained in
 :py:class:`~reconstruction.ecoli.knowledge_base_raw.KnowledgeBaseEcoli`. The actual logic
-of the ParCa is mostly contained with a single file: :py:mod:`~reconstruction.ecoli.fit_sim_data_1`.
+of the ParCa is mostly contained within a single file: :py:mod:`~reconstruction.ecoli.fit_sim_data_1`.
 The main interface for running the ParCa is :py:mod:`runscripts.parca`.
 
 Configuration
 =============
@@ -63,7 +63,7 @@ Configuration options for the ParCa are all located in a dictionary under the
 - ``new_genes``: String folder name in ``reconstruction/ecoli/flat/new_gene_data``
   containing necessary flat files to add new gene(s) to the model (see templates
   in ``reconstruction/ecoli/flat/new_gene_data/template``). By default, ``off`` does
-  nothing.
+  nothing (no new genes).
 - ``debug_parca``: If True, fit only one arbitrarily-chosen transcription factor
   in order to speed up a debug cycle.
 - ``save_intermediates``: Save intermediate pickle files for each major
@@ -79,8 +79,8 @@
 - ``intermediates_directory``: Path to folder where intermediate pickle files
   should be saved or loaded.
 - ``load_intermediate``: The function name of the ParCa step to load
-  sim_data and cell_specs from; functions prior to and including this
-  will be skipped but all following functions will run. Can only be used
-  if all ParCa steps up to and including named step were previously run
+  sim_data and cell_specs from; functions prior to and including this one
+  will be skipped but all subsequent functions will run. Can only be used
+  if all ParCa steps up to and including the named step were previously run
   successfully with ``save_intermediates`` set to True.
 - ``variable_elongation_transcription``: If True, enable variable elongation
@@ -100,7 +100,7 @@
 see how a cell responds differently when grown in different media conditions.
 Since most process parameters in our model come from the pickled
 :py:class:`~reconstruction.ecoli.simulation_data.SimulationDataEcoli`
 generated by the ParCa, we need an easy way to modify this object. The
-:py:mod:`runscripts.create_variants` script was designed for that purpose.
+:py:mod:`runscripts.create_variants` script was designed for this purpose.
 
 Template
 ========
@@ -183,12 +183,20 @@ combinations, each of which results in the creation of a variant of the
 :py:class:`~reconstruction.ecoli.simulation_data.SimulationDataEcoli`
 object.
 
+When manually running :py:mod:`runscripts.create_variants` (as opposed to
+running :py:mod:`runscripts.workflow`), the configuration file must also include:
+
+- Top-level (not under ``variants`` key) ``outdir`` option: path to directory
+  in which to save variant simulation data objects as pickle files
+- Top-level (not under ``variants`` key) ``kb`` option: path to directory
+  containing ParCa output pickle files
+
 .. _variant_output:
 
 Output
 ======
 
-These variant simulation data objects are pickled and saved in the
+The generated variant simulation data objects are pickled and saved in the
 directory given in the ``outdir`` key of the configuration JSON. They
 all have file names of the format ``{index}.cPickle``, where index
 is an integer. The unmodified simulation data object is always
@@ -257,7 +265,7 @@ it has access to.
     If you would like to use an analysis script with many
     different scopes, instead of duplicating the entire script in each
     analysis type folder, you can just create stub files in the appropriate folders
-    that simply import the ``plot`` function from a main analysis script.
+    that simply import the ``plot`` function from a primary analysis script.
 
 .. _analysis_config:
 
@@ -305,24 +313,24 @@ options under the ``analysis_options`` key:
 - ``validation_data_path``: List of string paths to validation data pickle
   files (generated by ParCa). Can pass any number of paths in any order and
   they will be passed as is to analysis script ``plot`` functions.
-- ``outdir``: Local (relative or abosolute) path to directory that serves as a prefix
+- ``outdir``: Local (relative or absolute) path to directory that serves as a prefix
   to the ``outdir`` argument for analysis script ``plot`` functions
-  (see :ref:`analysis_template`) and a copy of the configuration options
-  used to run :py:mod:`runscripts.analysis` is saved as ``metadata.json``.
+  (see :ref:`analysis_template`). A copy of the configuration options
+  used to run :py:mod:`runscripts.analysis` is saved as ``outdir/metadata.json``.
 - ``n_cpus``: Number of CPU cores to let DuckDB use
 - ``analysis_types``: List of analysis types to run. By default (if this option
  is not used), all analyses provided under all the analysis type keys are run
   on all possible subsets of the data after applying the data filters given using
   ``experiment_id``, ``variant``, etc. For example, say 2 experiment IDs are given
   with ``experiment_id``, 2 variants with ``variant``, 2 seeds with ``lineage_seed``,
-  2 generations with ``generation``, and 2 agent IDs with ``agent_id``. Analyses under
-  the ``multiexperiment`` key (if any) will each run once with all data passing this filter.
-  The ``multivariant`` analyses will each run twice, first with filtered data for one
-  experiment ID then with filtered data for the other. The ``multiseed`` analyses will
-  each run 4 times (2 exp IDs * 2 variants), the ``multigeneration`` analyses 8 times
-  (4 * 2 seeds), the ``multidaughter`` analyses 16 times (8 * 2 generations), and the
-  ``single`` analyses 32 times (16 * 2 agent IDs). If you only want to run the ``single``
-  and ``multivariant`` analyses, specify ``["single", "multivariant"]`` using this option.
+  and 2 generations with ``generation``. Assuming no simulations failed and ``single_daughter``
+  was set to True, analyses under the ``multiexperiment`` key (if any) will each run once
+  with all data passing this filter. ``multivariant`` analyses will each run twice, first
+  with filtered data for one experiment ID then with filtered data for the other. ``multiseed``
+  analyses will each run 4 times (2 exp IDs * 2 variants), ``multigeneration`` analyses
+  8 times (4 * 2 seeds), ``multidaughter`` analyses 16 times (8 * 2 generations), and
+  ``single`` analyses 16 times. If you only want to run the ``single`` and ``multivariant``
+  analyses, specify ``["single", "multivariant"]`` using this option.
 
 .. _analysis_template:
 
@@ -369,7 +377,7 @@ All analysis scripts must contain a ``plot`` function with the following signatu
     - Combination of ``sim_data_path``, ``variant``, ``variant_metadata_path``,
       and ``experiment_id`` configuration options
-    - Trawling directories in ``variant_data_dir`` and
+    - Traversing directories in ``variant_data_dir`` and
       matching discovered variants with experiment IDs given in ``experiment_id``
       (preferred route for most use cases)
 
@@ -413,6 +421,7 @@ the steps described above.
 
 Configuration
 =============
+
 All the previously covered configuration options also apply to the configuration
 JSON supplied to :py:mod:`runscripts.workflow`. Those options govern the behavior
 of the corresponding step in the workflow. For example, running
@@ -556,7 +565,7 @@ is a list workflow behaviors enabled in our model to handle unexpected errors.
 
 - When running on Sherlock, jobs that fail with exit codes 140 (hit job
   limits for RAM or runtime) or 143 (job was preempted by another user)
-  are automatically retried up to a maximum of 3 tries. For the resources
+  are automatically retried up to a maximum of 3 tries. For the resource
   limit error code (140), Nextflow will automatically request more RAM
  and a higher runtime limit with each attempt: ``4 * {attempt num}`` GB
   of memory and ``2 * {attempt num}`` hours of runtime. See the
@@ -565,7 +574,7 @@
 them to the SLURM scheduler. Nextflow was configured to limit the rate
-of job sumission and job queue polling to keep these failures to a
+of job submission and job queue polling to keep these failures to a
 minimum. Furthermore, jobs that fail to submit are automatically
-retried with a relatively long 5 minute delay to hopefully Avoid
+retried with a relatively long 5 minute delay to hopefully avoid
 any transient scheduler issues.
 - Jobs that fail for any reason other than the Sherlock reasons described
   above are ignored. This is mainly to allow a workflow to finish running
@@ -632,7 +641,7 @@ the output directory specified via ``out_dir`` or ``out_uri`` under the
 - ``{experiment ID}_report.html``: Contains detailed information about workflow
   run. Also serves to prevent users from accidentally running another workflow
   with the same experiment ID and overwriting data.
If a user wishes to do so, - they must first rename or delete this file. + they must first rename, move, or delete this file. - ``workflow_config.json``: Configuration JSON passed to :py:mod:`runscripts.workflow`. - ``nextflow_workdirs``: Contains all working directories for Nextflow jobs. diff --git a/ecoli/composites/ecoli_master.py b/ecoli/composites/ecoli_master.py index ae2536a4f..b99c45e12 100644 --- a/ecoli/composites/ecoli_master.py +++ b/ecoli/composites/ecoli_master.py @@ -242,7 +242,7 @@ def generate_processes_and_steps( ``defaults`` attribute of the process should be used as its config. In the case of a dictionary config, the dictionary will be merged with the result of :py:meth:`~ecoli.library.sim_data.LoadSimData.get_config_by_name` - if possible, or the ``defaults`` attribute if not. + if possible, or the ``defaults`` attribute if not. * ``processes``: Mapping of all process names (:py:class:`str`) @@ -290,7 +290,7 @@ def generate_processes_and_steps( Boolean option that only matters if ``division`` is true. Adds :py:class:`~ecoli.processes.cell_division.MarkDPeriod` if true. - + * ``generations``: If not ``None`` and ``divide`` is ``True``, adds :py:class:`~ecoli.processes.cell_division.StopAfterDivision` @@ -561,14 +561,8 @@ def generate_topology(self, config: dict[str, Any]) -> dict[str, tuple[str]]: ) # Only the bulk ports should be included in the request # and allocate topologies - topology[f"{process_id}_requester"]["request"] = ( - "request", - process_id - ) - topology[f"{process_id}_evolver"]["allocate"] = ( - "allocate", - process_id - ) + topology[f"{process_id}_requester"]["request"] = ("request", process_id) + topology[f"{process_id}_evolver"]["allocate"] = ("allocate", process_id) topology[f"{process_id}_requester"]["next_update_time"] = ( "next_update_time", process_id, @@ -812,6 +806,7 @@ def ecoli_topology_plot(config=None): """Make a topology plot of Ecoli""" # Import here to avoid circular import from ecoli.experiments.ecoli_master_sim import EcoliSim, SimConfig + default_config = SimConfig() if config is not None: default_config.update_from_dict(config) diff --git a/ecoli/experiments/ecoli_engine_process.py b/ecoli/experiments/ecoli_engine_process.py index 77a2c37c3..fd040581a 100644 --- a/ecoli/experiments/ecoli_engine_process.py +++ b/ecoli/experiments/ecoli_engine_process.py @@ -1,7 +1,7 @@ -"""Composite for simulations with EngineProcess cells in an environment. +""" +Composite for simulations with EngineProcess cells in an environment. .. note:: - This composite requires a config with the spatial environment enabled. """ @@ -54,6 +54,7 @@ def initial_state(self, config): class EcoliInnerSim(Composer): """Inner composer to be used with :py:class:`~ecoli.processes.engine_process.EngineProcess.""" + defaults = { "agent_id": "0", "seed": 0, @@ -69,8 +70,8 @@ def generate(self, config=None): as the inner simulation in an EngineProcess. This requires caching the initial state generated in the course - of calling :py:meth:`~ecoli.experiments.EcoliSim.build_ecoli` and - wrapping the returned composite using :py:class:`~.EcoliInnerWrapper` + of calling :py:meth:`~ecoli.experiments.ecoli_master_sim.EcoliSim.build_ecoli` + and wrapping the returned composite using :py:class:`~.EcoliInnerWrapper` to ensure that the cached initial state is returned when the ``initial_state`` method is called on the composite. 
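+
+        A minimal usage sketch (hypothetical config values)::
+
+            composite = EcoliInnerSim({"agent_id": "0", "seed": 0}).generate()
+            # returns the initial state cached during build_ecoli
+            state = composite.initial_state()
+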
@@ -153,6 +154,7 @@ class EcoliEngineProcess(Composer): includes the :py:class:`~ecoli.processes.environment.lysis.Lysis` Step when given a non-empty ``lysis_config``. """ + defaults = { "agent_id": "0", "seed": 0, diff --git a/ecoli/experiments/ecoli_master_sim.py b/ecoli/experiments/ecoli_master_sim.py index f5118dcef..942deb78b 100644 --- a/ecoli/experiments/ecoli_master_sim.py +++ b/ecoli/experiments/ecoli_master_sim.py @@ -56,6 +56,7 @@ user-specified JSON) instead of being directly overriden. """ + class TimeLimitError(RuntimeError): """Error raised when ``fail_at_total_time`` is True and simulation reaches ``total_time``.""" diff --git a/ecoli/processes/engine_process.py b/ecoli/processes/engine_process.py index 9030d8aab..3674f6844 100644 --- a/ecoli/processes/engine_process.py +++ b/ecoli/processes/engine_process.py @@ -114,8 +114,7 @@ def _get_path_net_depth(path: tuple[str]) -> int: def cap_tunneling_paths( - topology: dict[str, Any], - outer: tuple[str, ...]=tuple() + topology: dict[str, Any], outer: tuple[str, ...] = tuple() ) -> dict[tuple[str, ...], str]: """ For ports in the inner simulation that point to stores @@ -224,7 +223,7 @@ def __init__(self, parameters=None): """ Process that wraps a Vivarium simulation by taking the following options in its parameter dictionary: - + - ``inner_composer``: a composer for the inner simulation - ``inner_composer_config``: a configuration dictionary for that composer - ``outer_composer``: a composer that can be used upon division to create a new @@ -586,7 +585,8 @@ def _inverse_update( initial_state: Any, final_state: Any, store: Store, - updater_registry_reverse: dict[Callable, str]): + updater_registry_reverse: dict[Callable, str], +): """ Given a dictionary containing the current values contained inside a potentially nested store and a dictionary containing the final values inside that same @@ -600,7 +600,7 @@ def _inverse_update( store: Store (potentially nested) that we are trying to mutate updater_registry_reverse: A mapping from updater functions to the string names they are registered as in :py:attr:`~vivarium.core.registry.updater_registry` - + Returns: Update dictionary that when used to update ``store`` by calling its (or its sub-stores) updaters, causes its values to change from ``initial_state`` to ``final_state`` diff --git a/ecoli/processes/global_clock.py b/ecoli/processes/global_clock.py index abc942a69..ca59301a0 100644 --- a/ecoli/processes/global_clock.py +++ b/ecoli/processes/global_clock.py @@ -23,8 +23,10 @@ def calculate_timestep(self, states): advances by the same amount of time and processes that do not rely on this manual time stepping stay in sync with the ones that do. 
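+
+        For example, with ``states["global_time"]`` equal to 10.0 and
+        ``states["next_update_time"]`` equal to ``{"a": 11.0, "b": 12.5}``
+        (hypothetical values), the returned timestep is ``1.0``.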
""" - return min(next_update_time - states["global_time"] for next_update_time - in states["next_update_time"].values()) + return min( + next_update_time - states["global_time"] + for next_update_time in states["next_update_time"].values() + ) def next_update(self, timestep, states): """ diff --git a/ecoli/variants/new_gene_internal_shift.py b/ecoli/variants/new_gene_internal_shift.py index 34601383a..6bae9bae9 100644 --- a/ecoli/variants/new_gene_internal_shift.py +++ b/ecoli/variants/new_gene_internal_shift.py @@ -23,15 +23,15 @@ def get_new_gene_ids_and_indices( """ cistron_sim_data = sim_data.process.transcription.cistron_data.struct_array monomer_sim_data = sim_data.process.translation.monomer_data.struct_array - new_gene_cistron_ids = cast(list[str], - cistron_sim_data[cistron_sim_data["is_new_gene"]][ - "id" - ].tolist()) + new_gene_cistron_ids = cast( + list[str], cistron_sim_data[cistron_sim_data["is_new_gene"]]["id"].tolist() + ) cistron_monomer_id_dict = dict( zip(monomer_sim_data["cistron_id"], monomer_sim_data["id"]) ) new_monomer_ids = [ - cast(str, cistron_monomer_id_dict.get(cistron_id)) for cistron_id in new_gene_cistron_ids + cast(str, cistron_monomer_id_dict.get(cistron_id)) + for cistron_id in new_gene_cistron_ids ] if len(new_gene_cistron_ids) == 0: raise Exception( @@ -51,7 +51,8 @@ def get_new_gene_ids_and_indices( rna_data = sim_data.process.transcription.rna_data cistron_idx_dict = {rna[:-3]: i for i, rna in enumerate(rna_data["id"])} new_gene_indices = [ - cast(int, cistron_idx_dict.get(cistron_id)) for cistron_id in new_gene_cistron_ids + cast(int, cistron_idx_dict.get(cistron_id)) + for cistron_id in new_gene_cistron_ids ] monomer_idx_dict = {monomer: i for i, monomer in enumerate(monomer_sim_data["id"])} new_monomer_indices = [