From 3a39091059fc307a4830c521ebc416aebef6aba8 Mon Sep 17 00:00:00 2001 From: thalassemia Date: Tue, 2 Jul 2024 13:59:05 -0700 Subject: [PATCH 1/5] Type checking and linting CI --- .github/workflows/pytest.yml | 51 +++++++++++++++++-- .../analysis/antibiotics_colony/load_data.py | 24 ++++----- .../snapshot_and_hist_plot.py | 12 ++--- validation/ecoli/scripts/rnaDegRates.py | 2 +- wholecell/utils/dependency_graph.py | 8 ++- wholecell/utils/units.py | 4 +- 6 files changed, 70 insertions(+), 31 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 944cccf30..c386c3930 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -1,6 +1,6 @@ # Modified from GitHub Actions template -name: Pytest +name: Basic tests and QA # Improves reproducibility and speed env: @@ -14,7 +14,7 @@ on: branches: [master] jobs: - test: + Pytest: runs-on: ubuntu-latest strategy: matrix: @@ -30,12 +30,53 @@ jobs: pip install --upgrade pip wheel pip install numpy==1.26.4 pip install -r requirements.txt - - name: Setup - run: | - python setup.py install - name: Compile Cython components run: | make clean compile - name: Test with pytest run: | python -m pytest --cov=ecoli --durations=0 + Mypy: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11"] + steps: + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install --upgrade pip wheel + pip install numpy==1.26.4 mypy + pip install -r requirements.txt + - name: Compile Cython components + run: | + make clean compile + - name: Mypy + run: | + mypy + Lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11"] + steps: + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install --upgrade pip wheel + pip install numpy==1.26.4 ruff + pip install -r requirements.txt + - name: Compile Cython components + run: | + make clean compile + - name: Ruff + run: | + ruff check diff --git a/ecoli/analysis/antibiotics_colony/load_data.py b/ecoli/analysis/antibiotics_colony/load_data.py index ea87ea283..d62c4a430 100644 --- a/ecoli/analysis/antibiotics_colony/load_data.py +++ b/ecoli/analysis/antibiotics_colony/load_data.py @@ -53,23 +53,21 @@ def load_data( experiment_id=None, cpus=8, sampling_rate=2, host="10.138.0.75", port=27017 ): # Get data for the specified experiment_id - monomers = [path[-1] for path in PATHS_TO_LOAD.values() if path[0] == "monomer"] - mrnas = [path[-1] for path in PATHS_TO_LOAD.values() if path[0] == "mrna"] - inner_paths = [ - path - for path in PATHS_TO_LOAD.values() - if path[-1] not in mrnas - and path[-1] not in monomers - and path != ("total_mrna",) - ] - outer_paths = [("data", "dimensions"), ("data", "fields")] + # monomers = [path[-1] for path in PATHS_TO_LOAD.values() if path[0] == "monomer"] + # mrnas = [path[-1] for path in PATHS_TO_LOAD.values() if path[0] == "mrna"] + # inner_paths = [ + # path + # for path in PATHS_TO_LOAD.values() + # if path[-1] not in mrnas + # and path[-1] not in monomers + # and path != ("total_mrna",) + # ] + # outer_paths = [("data", "dimensions"), ("data", "fields")] for condition, seeds in EXPERIMENT_ID_MAPPING.items(): for seed, curr_experiment_id in seeds.items(): if curr_experiment_id != experiment_id: continue metadata = {condition: {seed: {}}} - # TODO: Convert to use DuckDB - raise NotImplementedError("Still need to convert to use DuckDB!") rep_data = {} with ProcessPoolExecutor(cpus) as executor: print("Deserializing data and removing units...") @@ -127,6 +125,8 @@ def main(): ) args = parser.parse_args() os.makedirs("data/colony_data/sim_dfs/", exist_ok=True) + # TODO: Convert to use DuckDB + raise NotImplementedError("Still need to convert to use DuckDB!") load_data(args.experiment_id, cpus=args.cpus) diff --git a/ecoli/analysis/antibiotics_colony/snapshot_and_hist_plot.py b/ecoli/analysis/antibiotics_colony/snapshot_and_hist_plot.py index 2d48e0483..abbdb801d 100644 --- a/ecoli/analysis/antibiotics_colony/snapshot_and_hist_plot.py +++ b/ecoli/analysis/antibiotics_colony/snapshot_and_hist_plot.py @@ -212,12 +212,12 @@ def format_tick(tick): def get_data(experiment_id, time, molecules, host, port, cpus, verbose): # Prepare molecule paths for access_counts() - monomers = [m[-1] for m in molecules if m[-2] == "monomer"] - mrnas = [m[-1] for m in molecules if m[-2] == "mrna"] - inner_paths = [ - path for path in molecules if path[-1] not in mrnas and path[-1] not in monomers - ] - outer_paths = [("data", "dimensions")] + # monomers = [m[-1] for m in molecules if m[-2] == "monomer"] + # mrnas = [m[-1] for m in molecules if m[-2] == "mrna"] + # inner_paths = [ + # path for path in molecules if path[-1] not in mrnas and path[-1] not in monomers + # ] + # outer_paths = [("data", "dimensions")] if verbose: print(f"Accessing data for experiment {experiment_id}...") diff --git a/validation/ecoli/scripts/rnaDegRates.py b/validation/ecoli/scripts/rnaDegRates.py index 03a1d5825..f9405bfe8 100644 --- a/validation/ecoli/scripts/rnaDegRates.py +++ b/validation/ecoli/scripts/rnaDegRates.py @@ -87,7 +87,7 @@ def main(): plt.plot([0, maxLine], [0, maxLine], "--r") plt.plot(model, paper, "o", markeredgecolor="k", markerfacecolor="none") plt.axis((0, 1, 0, maxLine)) - Correlation_ExpPred = np.corrcoef(model, paper)[0][1] + Correlation_ExpPred = np.corrcoef(model, paper)[0][1] # noqa: F841 plt.xlabel("RNA decay rate expected from model [1/min]") plt.ylabel("RNA decay rate from paper (Moffitt et al. 2016) [1/min]") diff --git a/wholecell/utils/dependency_graph.py b/wholecell/utils/dependency_graph.py index 4e6f71637..eff2f4e1b 100644 --- a/wholecell/utils/dependency_graph.py +++ b/wholecell/utils/dependency_graph.py @@ -29,10 +29,9 @@ class DependencyGraph(object): def __init__(self): # type: () -> None """Initialize dependencies to empty dictionary""" - self.dependencies = {} # type: dict[str, list[str]] + self.dependencies: dict[str, list[str]] = {} - def add_nodes(self, nodes): - # type: (Iterable[str]) -> None + def add_nodes(self, nodes: Iterable[str]): """Add nodes with no dependencies Arguments: @@ -41,8 +40,7 @@ def add_nodes(self, nodes): for node in nodes: self.dependencies[node] = [] - def add_dep_relation(self, a, b): - # type: (str, str) -> None + def add_dep_relation(self, a: str, b: str): """Add an edge such that a depends on b If a or b does not exist yet as a node, it will be created. diff --git a/wholecell/utils/units.py b/wholecell/utils/units.py index bb2ed855e..bebfe5ec6 100644 --- a/wholecell/utils/units.py +++ b/wholecell/utils/units.py @@ -12,8 +12,8 @@ import numpy as np # noinspection PyUnresolvedReferences -from unum.units import mol, mmol, g, h, L, fg, min, s, umol, dmol, J, K # satisfy mypy -from unum.units import * +from unum.units import mol, mmol, g, h, L, fg, min, s, umol, dmol, J, K # noqa: F401 +from unum.units import * # noqa: F403 from unum import Unum count = Unum.unit("count", mol / scipy.constants.Avogadro) From 50be203773d82bbb3afb8f9753409d35bbeec26f Mon Sep 17 00:00:00 2001 From: thalassemia Date: Tue, 2 Jul 2024 14:27:09 -0700 Subject: [PATCH 2/5] Rename wcecoli_state to json_state --- ecoli/composites/ecoli_master.py | 2 +- ecoli/composites/ecoli_spatial.py | 2 +- ecoli/experiments/ecoli_engine_process.py | 2 +- ecoli/experiments/metabolism_exchanger.py | 2 +- ecoli/experiments/metabolism_redux_sim.py | 2 +- ecoli/experiments/tet_amp_sim.py | 2 +- ecoli/library/{wcecoli_state.py => json_state.py} | 0 ecoli/processes/antibiotics/permeability.py | 2 +- migration/allocator.py | 2 +- migration/chromosome_replication.py | 2 +- migration/initial_state.py | 2 +- migration/migration_utils.py | 2 +- 12 files changed, 11 insertions(+), 11 deletions(-) rename ecoli/library/{wcecoli_state.py => json_state.py} (100%) diff --git a/ecoli/composites/ecoli_master.py b/ecoli/composites/ecoli_master.py index 9a62a2c6a..bf8dd1a9d 100644 --- a/ecoli/composites/ecoli_master.py +++ b/ecoli/composites/ecoli_master.py @@ -38,7 +38,7 @@ # state from ecoli.processes.partition import Requester, Evolver, Step, Process -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file SIM_DATA_PATH = os.path.abspath( os.path.join( diff --git a/ecoli/composites/ecoli_spatial.py b/ecoli/composites/ecoli_spatial.py index 6216e303a..581afb5f0 100644 --- a/ecoli/composites/ecoli_spatial.py +++ b/ecoli/composites/ecoli_spatial.py @@ -83,7 +83,7 @@ DiffusionNetwork, ) -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file from ecoli.library.schema import attrs, bulk_name_to_idx SIM_DATA_PATH = "reconstruction/sim_data/kb/simData.cPickle" diff --git a/ecoli/experiments/ecoli_engine_process.py b/ecoli/experiments/ecoli_engine_process.py index 4cf8a3958..4b1c61c2f 100644 --- a/ecoli/experiments/ecoli_engine_process.py +++ b/ecoli/experiments/ecoli_engine_process.py @@ -31,7 +31,7 @@ from ecoli.library.logging_tools import write_json from ecoli.library.sim_data import RAND_MAX from ecoli.library.schema import not_a_process -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file from ecoli.processes.engine_process import EngineProcess from ecoli.processes.environment.field_timeline import FieldTimeline from ecoli.processes.environment.lysis import Lysis diff --git a/ecoli/experiments/metabolism_exchanger.py b/ecoli/experiments/metabolism_exchanger.py index 42a4dd7de..d88c02fe7 100644 --- a/ecoli/experiments/metabolism_exchanger.py +++ b/ecoli/experiments/metabolism_exchanger.py @@ -13,7 +13,7 @@ # vivarium-ecoli imports from ecoli.library.sim_data import LoadSimData -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file from ecoli.composites.ecoli_master import SIM_DATA_PATH from ecoli.processes.metabolism_redux import MetabolismRedux from ecoli.processes.stubs.exchange_stub import Exchange diff --git a/ecoli/experiments/metabolism_redux_sim.py b/ecoli/experiments/metabolism_redux_sim.py index 4d45e49f7..de2bbcb01 100644 --- a/ecoli/experiments/metabolism_redux_sim.py +++ b/ecoli/experiments/metabolism_redux_sim.py @@ -11,7 +11,7 @@ # vivarium-ecoli imports from ecoli.experiments.ecoli_master_sim import EcoliSim, CONFIG_DIR_PATH -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file import numpy as np diff --git a/ecoli/experiments/tet_amp_sim.py b/ecoli/experiments/tet_amp_sim.py index 175025228..218ea1f4a 100644 --- a/ecoli/experiments/tet_amp_sim.py +++ b/ecoli/experiments/tet_amp_sim.py @@ -9,7 +9,7 @@ from ecoli.experiments.ecoli_master_sim import CONFIG_DIR_PATH, SimConfig from ecoli.library.parameters import param_store from ecoli.library.logging_tools import write_json -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file AVOGADRO = constants.N_A / units.mol diff --git a/ecoli/library/wcecoli_state.py b/ecoli/library/json_state.py similarity index 100% rename from ecoli/library/wcecoli_state.py rename to ecoli/library/json_state.py diff --git a/ecoli/processes/antibiotics/permeability.py b/ecoli/processes/antibiotics/permeability.py index c71d1b6e8..689ecb172 100644 --- a/ecoli/processes/antibiotics/permeability.py +++ b/ecoli/processes/antibiotics/permeability.py @@ -1,6 +1,6 @@ import numpy as np from ecoli.library.schema import numpy_schema, bulk_name_to_idx, counts -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file from ecoli.processes.bulk_timeline import BulkTimelineProcess from vivarium.core.emitter import timeseries_from_data from vivarium.core.engine import Engine diff --git a/migration/allocator.py b/migration/allocator.py index 1cdb0a243..86aec933f 100644 --- a/migration/allocator.py +++ b/migration/allocator.py @@ -9,7 +9,7 @@ from ecoli.processes.allocator import Allocator from migration import LOAD_SIM_DATA from migration.migration_utils import run_non_partitioned_process, recursive_compare -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file with open("data/proc_to_name.json", "r") as f: diff --git a/migration/chromosome_replication.py b/migration/chromosome_replication.py index 854f3e786..6209b19ad 100644 --- a/migration/chromosome_replication.py +++ b/migration/chromosome_replication.py @@ -4,7 +4,7 @@ from vivarium.core.engine import pf -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file from ecoli.processes.chromosome_replication import ChromosomeReplication from migration.migration_utils import get_process_state, run_and_compare diff --git a/migration/initial_state.py b/migration/initial_state.py index e9a6a867e..359ad292a 100644 --- a/migration/initial_state.py +++ b/migration/initial_state.py @@ -1,7 +1,7 @@ import numpy as np from ecoli.library.sim_data import LoadSimData, SIM_DATA_PATH_NO_OPERONS -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file def compare_states(viv_state, wc_state): diff --git a/migration/migration_utils.py b/migration/migration_utils.py index 77ce7fb2e..ba8d25e26 100644 --- a/migration/migration_utils.py +++ b/migration/migration_utils.py @@ -8,7 +8,7 @@ from vivarium.library.dict_utils import deep_merge from wholecell.utils import units -from ecoli.library.wcecoli_state import get_state_from_file +from ecoli.library.json_state import get_state_from_file from migration import LOAD_SIM_DATA, LOAD_SIM_DATA_NO_OPERONS PERCENT_ERROR_THRESHOLD = 0.05 From 6f9b46cd819fcd0f56fd55b1dcb243733ac4e9b1 Mon Sep 17 00:00:00 2001 From: thalassemia Date: Tue, 2 Jul 2024 14:27:51 -0700 Subject: [PATCH 3/5] Delete old log update and division topology logic --- ecoli/composites/ecoli_configs/__init__.py | 15 --------------- ecoli/experiments/ecoli_master_sim.py | 8 -------- 2 files changed, 23 deletions(-) diff --git a/ecoli/composites/ecoli_configs/__init__.py b/ecoli/composites/ecoli_configs/__init__.py index 5addb2a6b..7e9a6cd34 100644 --- a/ecoli/composites/ecoli_configs/__init__.py +++ b/ecoli/composites/ecoli_configs/__init__.py @@ -97,18 +97,3 @@ ) ECOLI_DEFAULT_TOPOLOGY[process] = process_topology - -# Add log_update ports if log_updates is True -if config["log_updates"]: - for process, ports in ECOLI_DEFAULT_TOPOLOGY.items(): - ECOLI_DEFAULT_TOPOLOGY[process]["log_update"] = ( - "log_update", - process, - ) - -# add division -if config["divide"]: - ECOLI_DEFAULT_TOPOLOGY["division"] = { - "variable": ("listeners", "mass", "cell_mass"), - "agents": ["..", "..", "agents"], - } diff --git a/ecoli/experiments/ecoli_master_sim.py b/ecoli/experiments/ecoli_master_sim.py index 6e5554451..bf5423e1d 100644 --- a/ecoli/experiments/ecoli_master_sim.py +++ b/ecoli/experiments/ecoli_master_sim.py @@ -607,14 +607,6 @@ def _retrieve_topology( deep_merge(process_topology, tuplify_topology(topology[process])) result[process] = process_topology - # Add log_update ports if log_updates is True - if log_updates: - for process, ports in result.items(): - result[process]["log_update"] = ( - "log_update", - process, - ) - return result def _retrieve_process_configs( From 321f52de4800fb6b3ff30e6b48b2520514bf5302 Mon Sep 17 00:00:00 2001 From: thalassemia Date: Tue, 2 Jul 2024 18:14:29 -0700 Subject: [PATCH 4/5] Basic Jenkins tests --- runscripts/create_gcp_bucket.sh | 2 - runscripts/debug/compare_pickles.py | 502 ++++++++++++++++++ runscripts/debug/diff_simouts.py | 24 + runscripts/jenkins/check-reproducibility.sh | 40 ++ .../configs/ecoli-glucose-minimal.json | 15 + .../jenkins/configs/reproducibility.json | 10 + runscripts/jenkins/setup-environment.sh | 14 + runscripts/jenkins/workflow.sh | 5 + 8 files changed, 610 insertions(+), 2 deletions(-) delete mode 100644 runscripts/create_gcp_bucket.sh create mode 100644 runscripts/debug/compare_pickles.py create mode 100644 runscripts/debug/diff_simouts.py create mode 100644 runscripts/jenkins/check-reproducibility.sh create mode 100644 runscripts/jenkins/configs/ecoli-glucose-minimal.json create mode 100644 runscripts/jenkins/configs/reproducibility.json create mode 100644 runscripts/jenkins/setup-environment.sh create mode 100644 runscripts/jenkins/workflow.sh diff --git a/runscripts/create_gcp_bucket.sh b/runscripts/create_gcp_bucket.sh deleted file mode 100644 index 9d5576ded..000000000 --- a/runscripts/create_gcp_bucket.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -gcloud storage buckets create gs://$0 --location=$1 diff --git a/runscripts/debug/compare_pickles.py b/runscripts/debug/compare_pickles.py new file mode 100644 index 000000000..37c75504c --- /dev/null +++ b/runscripts/debug/compare_pickles.py @@ -0,0 +1,502 @@ +""" +Compare two .cPickle files or all the .cPickle files in a pair of directories. +Show the differences or optionally just a count of difference lines. + +Usage (PATH is a path like 'out/manual/intermediates'): + runscripts/debug/comparePickles.py PATH1 PATH2 +""" +import argparse +from collections.abc import Mapping, Sequence +import functools +import numbers +import os +import pickle +from pprint import pformat +import re +import sys +import types +from typing import Dict + +import Bio.Seq +import numpy as np +import scipy.interpolate +import sympy +from sympy.matrices import dense +import unum + +from wholecell.utils import constants +import wholecell.utils.unit_struct_array + + +NULP = 0 # float comparison tolerance, in Number of Units in the Last Place + +# Objects with a list of attributes to compare +SPECIAL_OBJECTS = { + scipy.interpolate.CubicSpline: ['x', 'c', 'axis'], + wholecell.utils.unit_struct_array.UnitStructArray: ['struct_array', 'units'], + } + +LEAF_TYPES = ( + unum.Unum, + Bio.Seq.Seq, + sympy.Basic, + numbers.Number, + functools.partial, + types.FunctionType, + dense.MutableDenseMatrix, + wholecell.utils.unit_struct_array.UnitStructArray) + +WHITESPACE = re.compile(r'\s+') + + +class Repr(object): + '''A Repr has the given repr() string without quotes and != any other value.''' + def __init__(self, repr_): + self.repr_ = repr_ + + def __repr__(self): + return self.repr_ + + +def has_python_vars(obj): + """ + Returns true if the given object has any Python instance variables, that is + ordinary fields or compact slots. If not, it's presumably a built-in type + or extension type implemented entirely in C and Cython. + """ + return hasattr(obj, '__dict__') or hasattr(obj, '__slots__') + +def all_vars(obj): + """ + Returns a dict of all the object's instance variables stored in ordinary + fields and in compact slots. This expands on the built-in function `vars()`. + If the object implements the pickling method `__getstate__`, call that + instead to get its defining state. + """ + if hasattr(obj, '__getstate__'): + # noinspection PyCallingNonCallable + return obj.__getstate__() + + attrs = getattr(obj, '__dict__', {}) + attrs.update({key: getattr(obj, key) for key in getattr(obj, '__slots__', ())}) + return attrs + +def is_leaf(value, leaves=LEAF_TYPES): + """ + Predicate to determine if we have reached the end of how deep we want to traverse + through the object tree. + """ + if isinstance(value, (Mapping, Sequence)): + return isinstance(value, (bytes, str)) + return (callable(value) # it's callable + or isinstance(value, leaves) # it's an instance of a declared leaf type + or not has_python_vars(value)) # an object without Python instance variables + +def object_tree(obj, path='', debug=None): + """ + Diagnostic tool to inspect a complex data structure. + + Given an object, exhaustively traverse down all attributes it contains until leaves + are reached, and convert everything found into a dictionary or a list. The resulting + dictionary will mirror the structure of the original object, but instead of + attributes with values it will be a dictionary where the keys are the attribute + names. The type of the dictionarified object will be encoded under the key `!type` + which is assumed to not be in conflict with any other attributes. The result should + aid in serialization and deserialization of the object and is intended to be a + translation of a pickled object. + + Args: + obj (object): The object to inspect. + path (optional str): The root path of this object tree. This will be built upon + for each child of the current object found and reported in a value is + provided for `debug`. + debug (optional str): If provided, prints paths of the attributes encountered. + If the value is 'ALL', it will print every path. If the value is 'CALLABLE', + it will only print methods and functions it finds. + """ + + if debug == 'ALL': + print(path) + + if is_leaf(obj): + if callable(obj) and (debug == 'CALLABLE'): + print('{}: {}'.format(path, obj)) + return obj + elif isinstance(obj, Mapping): + return {key: object_tree(value, "{}['{}']".format(path, key), debug) + for (key, value) in obj.items()} + elif isinstance(obj, Sequence): + return [object_tree(subobj, "{}[{}]".format(path, index), debug) for index, subobj in enumerate(obj)] + else: + attrs = all_vars(obj) + tree = {key: object_tree(value, "{}.{}".format(path, key), debug) + for (key, value) in attrs.items()} + tree['!type'] = type(obj) + + return tree + +def size_tree(o, cutoff=0.1): + """ + Find the size of attributes in an object tree. Sizes greater than the cutoff + (in MB) will be returned for displaying. Sizes include all values contained + within an attribute (eg. a Dict will be represented by the size of all keys + and values in addition to the Dict size itself). + + TODO: double check total size vs disk size - might be missing some types + """ + + def return_val(total, value): + if total > cutoff and value: + return total, value + else: + return total, + + def get_size(o): + return sys.getsizeof(o) / 2**20 # convert to MB + + size = get_size(o) + + # special handling of leaf to get size of defining attributes + if isinstance(o, unum.Unum): + size += size_tree(o._unit)[0] + size += get_size(o._value) + return size, + + # special handling of leaf to get size of str sequence + elif isinstance(o, Bio.Seq.Seq): + size += get_size(o._data) + return size, + + # special handling of leaf, each entry is allocated the same amount of space + elif isinstance(o, wholecell.utils.unit_struct_array.UnitStructArray): + size += size_tree(o.units)[0] + n_entries = len(o.struct_array) + if n_entries: + size += get_size(o.struct_array[0]) * n_entries + return size, + + # if a special object, check predefined attributes for equality + elif type(o) in SPECIAL_OBJECTS: + sizes = {} + attrs = SPECIAL_OBJECTS[type(o)] + for attr in attrs: + subsizes = size_tree(getattr(o, attr), cutoff) + size += subsizes[0] + if subsizes[0] > cutoff: + formatted = float('{:.2f}'.format(subsizes[0])) + if len(subsizes) == 1: + val = formatted + else: + val = (formatted, subsizes[1]) + sizes[attr] = val + return return_val(size, sizes) + + # if it is a leaf, just return the size + # TODO: any special handling for types that are not already accounted for above + elif is_leaf(o): + return size, + + # if it is a dictionary, then get the size of keys and values + elif isinstance(o, Mapping): + sizes = {} + total_size = size + for key, value in o.items(): + subsizes = size_tree(value, cutoff) + entry_size = subsizes[0] + get_size(key) + total_size += entry_size + if entry_size > cutoff: + formatted = float('{:.2f}'.format(entry_size)) + if len(subsizes) == 1: + val = formatted + else: + val = (formatted, subsizes[1]) + sizes[key] = val + return return_val(total_size, sizes) + + # if it is a sequence, then get the size of each element + elif isinstance(o, Sequence): + sizes = [] + total_size = size + for value in o: + subsizes = size_tree(value, cutoff) + total_size += subsizes[0] + if subsizes[0] > cutoff: + formatted = float('{:.2f}'.format(subsizes[0])) + if len(subsizes) == 1: + val = formatted + else: + val = (formatted, subsizes[1]) + sizes.append(val) + return return_val(total_size, sizes) + + else: + return size, + +def _are_instances_of(a, b, a_type): + """ + Return True if `a` and `b` are both instances of the given type (or tuple + of types). + """ + return isinstance(a, a_type) and isinstance(b, a_type) + +def diff_trees(a, b): + """ + Find the differences between two trees or leaf nodes a and b. Return a + falsey value if the inputs match OR a truthy value that explains or + summarizes their differences, where each point in the tree where the inputs + differ will be a tuple (a's value, b's value, optional description). + + Floating point numbers are compared with the tolerance set by the constant + NULP (Number of Units in the Last Place), allowing for NaN and infinite + values. (Adjust the tolerance level NULP if needed.) + + This operation is symmetrical. + """ + + # treat str and Python 2 unicode as the same leaf type + # ditto for int and Python 2 long + if (_are_instances_of(a, b, str) + or _are_instances_of(a, b, int)): + if a != b: + return elide(a), elide(b) + + # if they aren't they same type, they are clearly different. Also this lets us + # safely assume throughout the rest of the function that a and b are the same type + elif type(a) != type(b): + return elide(a, max_len=400), elide(b, max_len=400) + + # if they are floats, handle various kinds of values + elif isinstance(a, float): + return compare_floats(a, b) + + # if they are numpy arrays, compare them using a numpy testing function + elif isinstance(a, np.ndarray): + return compare_ndarrays(a, b) + + # if they are Unums compare their contents with matching units + elif isinstance(a, unum.Unum): + a0, b0 = a.matchUnits(b) + return diff_trees(a0.asNumber(), b0.asNumber()) + + # if a special object, check predefined attributes for equality + elif type(a) in SPECIAL_OBJECTS: + diff = {} + attrs = SPECIAL_OBJECTS[type(a)] + for attr in attrs: + subdiff = diff_trees(getattr(a, attr), getattr(b, attr)) + if subdiff: + diff[attr] = subdiff + return diff + + # if they are leafs (including strings) use python equality comparison + elif is_leaf(a): + if a != b: + return elide(a), elide(b) + + # if they are dictionaries then diff the value under each key + elif isinstance(a, Mapping): + diff = {} + na = Repr('--') + nb = Repr('--') + for key in set(a.keys()) | set(b.keys()): + subdiff = diff_trees(a.get(key, na), b.get(key, nb)) + if subdiff: + diff[key] = subdiff + return diff + + # if they are sequences then compare each element at each index + elif isinstance(a, Sequence): + if len(a) > len(b): + b = list(b) + (len(a) - len(b)) * [Repr('--')] + elif len(b) > len(a): + a = list(a) + (len(b) - len(a)) * [Repr('--')] + + diff = [] + for index in range(len(a)): + subdiff = diff_trees(a[index], b[index]) + if subdiff: + diff.append(subdiff) + return diff + + # this should never happen + else: + print('value not considered by `diff_trees`: {} {}'.format(a, b)) + +def elide(value, max_len=200): + '''Return a value with the same repr but elided if it'd be longer than max.''' + repr_ = repr(value) + if len(repr_) > max_len: + return Repr(repr_[:max_len] + '...') + return value + +def simplify_error_message(message): + return elide(Repr(WHITESPACE.sub(' ', message).strip())) + +def compare_floats(f1, f2): + '''Compare two floats, allowing some tolerance, NaN, and Inf values. This + considers all types of NaN to match. + Return 0.0 (which is falsey) if they match, else (f1, f2). + ''' + if f1 == f2 or np.isnan(f1) and np.isnan(f2): + return 0.0 + try: + np.testing.assert_array_almost_equal_nulp(f1, f2, nulp=NULP) + return 0.0 + except AssertionError: + # FWIW, the string error.args[0] tells the NULP difference. + return f1, f2 + +def compare_ndarrays(array1, array2): + '''Compare two ndarrays, checking the shape and all elements, allowing for + NaN values and non-numeric values. Return () if they match, else a tuple of + diff info or just a diff description. + + TODO(jerry): Allow tolerance for float elements of structured arrays and + handle NaN and Inf values. + ''' + + def summarize_array(ndarray): + return Repr(f"array({ndarray.shape} {ndarray.dtype})") + + if array1.shape != array2.shape: + return summarize_array(array1), summarize_array(array2) + + object_dtype = np.dtype(object) + if issubclass(array1.dtype.type, np.floating): + try: + # This handles float tolerance but not NaN and Inf. + with np.errstate(invalid='ignore'): + np.testing.assert_array_almost_equal_nulp(array1, array2, nulp=NULP) + return () + except AssertionError as _: + # return elide(array1), elide(array2), simplify_error_message(e.args[0]) + pass # try again, below + # Handle ragged arrays created with an object dtype + elif array1.dtype == object_dtype and array2.dtype == object_dtype: + try: + assert array1.shape == array2.shape + for sub1, sub2 in zip(array1, array2): + np.testing.assert_equal(sub1, sub2) + return () + except AssertionError as e: + return simplify_error_message(e.args[0]) + + try: + # This handles non-float dtypes, also NaN and Inf, but no tolerance. + np.testing.assert_array_equal(array1, array2) + return () + except AssertionError as e: + return simplify_error_message(e.args[0]) + + +def load_tree(path): + """Load a .cPickle file as an object_tree.""" + with open(path, "rb") as f: + data = pickle.load(f, fix_imports=True, encoding='latin1') + return object_tree(data) + + +def load_fit_tree(out_subdir): + '''Load the parameter calculator's (Parca's) output as an object_tree.''' + # For convenience, optionally add the prefix 'out/'. + if not os.path.isabs(out_subdir) and not os.path.isdir(out_subdir): + out_subdir = os.path.join('out', out_subdir) + + path = os.path.join( + out_subdir, + constants.KB_DIR, + constants.SERIALIZED_SIM_DATA_FILENAME) + + return load_tree(path) + +def pprint_diffs(diffs, *, width=160, print_diff_lines=True, print_count=True): + '''Pretty-print the diff info: optionally print the detailed diff lines, + optionally print the diff line count as a single figure of merit; then + return the line count. + ''' + if diffs: + diff_lines = pformat(diffs, width=width) + if print_diff_lines: + print(diff_lines) + line_count = len(diff_lines.strip().splitlines()) + else: + line_count = 0 + + if print_count: + print('==> lines of differences: {}'.format(line_count)) + return line_count + + +def diff_files(path1: str, path2: str, print_diff_lines: bool = True) -> int: + """Diff the pair of named pickle files. Return the diff line count.""" + tree1 = load_tree(path1) + tree2 = load_tree(path2) + diffs = diff_trees(tree1, tree2) + return pprint_diffs(diffs, print_diff_lines=print_diff_lines) + + +def list_pickles(directory: str) -> Dict[str, str]: + """Return a map of .cPickle file names to paths in the given directory + sorted by file modification time then by filename.""" + entries = [(entry.stat().st_mtime, entry.name, entry.path) + for entry in os.scandir(directory) + if entry.is_file() and entry.name.endswith('.cPickle')] + files = {e[1]: e[2] for e in sorted(entries)} + return files + + +def diff_dirs(dir1: str, dir2: str, print_diff_lines: bool = True) -> int: + """Diff the pickle files in the pair of named directories. Return the total + diff line count.""" + print(f'Comparing pickle files in "{dir1}" vs. "{dir2}".') + pickles1 = list_pickles(dir1) + pickles2 = list_pickles(dir2) + count = 0 + + for name, path1 in pickles1.items(): + print(f'\n*** {name} {"*" * (75 - len(name))}') + path2 = pickles2.get(name) + if path2: + count += diff_files(path1, path2, print_diff_lines) + else: + print(f'{name} is in {dir1} but not {dir2}') + count += 1 + + only_in_dir2 = pickles2.keys() - pickles1.keys() + if only_in_dir2: + print(f'\n*** Pickle files in {dir2} but not {dir1}:\n' + f'{sorted(only_in_dir2)}') + count += len(only_in_dir2) + + print(f'\n====> Total differences: {count} lines for {len(pickles1)} pickle' + f' files in {dir1} against {len(pickles2)} pickle files in {dir2}.') + return count + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Compare two .cPickle files" + " or all the .cPickle files in two directories (in" + " modification-time order)." + " Print a count and optionally a summary of the differences.") + parser.add_argument('-c', '--count', action='store_true', + help="Print just the diff line count for each file, skipping the" + " detailed diff lines.") + parser.add_argument('-f', '--final-sim-data', action='store_true', + help="Append /kb/simData.cPickle to the two PATH args to make it a" + " little easier compare the final Parca output sim_data.") + parser.add_argument('path', metavar='PATH', nargs=2, + help="The two pickle files or directories to compare.") + + args = parser.parse_args() + path1, path2 = args.path + + if args.final_sim_data: + path1 = os.path.join(path1, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME) + path2 = os.path.join(path2, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME) + + if os.path.isfile(path1): + diff_count = diff_files(path1, path2, print_diff_lines=not args.count) + else: + diff_count = diff_dirs(path1, path2, print_diff_lines=not args.count) + + sys.exit(3 if diff_count else 0) diff --git a/runscripts/debug/diff_simouts.py b/runscripts/debug/diff_simouts.py new file mode 100644 index 000000000..5575d9e23 --- /dev/null +++ b/runscripts/debug/diff_simouts.py @@ -0,0 +1,24 @@ +import argparse +import duckdb + +from ecoli.library.parquet_emitter import get_dataset_sql + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Compare simulation output in two ouput directories") + parser.add_argument('path', metavar='PATH', nargs=2, + help="The two output directories to compare.") + + args = parser.parse_args() + path1, path2 = args.path + + h_1, c_1 = get_dataset_sql(path1) + h_2, c_2 = get_dataset_sql(path2) + id_cols = "experiment_id, variant, lineage_seed, generation, agent_id, time" + ordered_sql = f"SELECT * FROM ({{sql_query}}) ORDER BY {id_cols}" + for sql_1, sql_2 in [(h_1, h_2), (c_1, c_2)]: + data_1 = duckdb.sql(ordered_sql.format(sql_query=sql_1)).arrow() + data_2 = duckdb.sql(ordered_sql.format(sql_query=sql_2)).arrow() + assert data_1.column_names == data_2.column_names, "Different columns." + for i, (col_1, col_2) in enumerate(zip(data_1, data_2)): + assert col_1 == col_2, f"{data_1.column_names[i]} is different." diff --git a/runscripts/jenkins/check-reproducibility.sh b/runscripts/jenkins/check-reproducibility.sh new file mode 100644 index 000000000..a8f7b848e --- /dev/null +++ b/runscripts/jenkins/check-reproducibility.sh @@ -0,0 +1,40 @@ +#! /usr/bin/env bash + +# Runs the parca and sims twice to compare output to ensure both +# are reproducible from run to run. Will exit with an error code +# if either the parca or sim runs produce different output from +# each other. + +set -eu + +script_dir=$(dirname $(dirname $(realpath $0))) +out_dir=$(dirname $script_dir)/out +dir1="check-reproducibility-1" +dir2="check-reproducibility-2" +git_hash=$(git rev-parse HEAD) + +source $script_dir/jenkins/setup-environment.sh + +# Run parca twice and check that output is consistent from run to run +python $script_dir/parca.py -c 4 -o $out_dir/$dir1-$git_hash --save-intermediates +python $script_dir/parca.py -c 4 -o $out_dir/$dir2-$git_hash --save-intermediates +$script_dir/debug/compare_pickles.py $out_dir/$dir1-$git_hash/kb $out_dir/$dir2-$git_hash/kb + +# Run entire simulation for each parca output +python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \ + --config $script_dir/jenkins/configs/reproducibility.json \ + --experiment_id $dir1-$git_hash --daughter_outdir $out_dir/$dir1-$git_hash \ + --sim_data_path $out_dir/$dir1-$git_hash/kb/simData.cPickle +python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \ + --config $script_dir/jenkins/configs/reproducibility.json \ + --experiment_id $dir2-$git_hash --daughter_outdir $out_dir/$dir2-$git_hash \ + --sim_data_path $out_dir/$dir2-$git_hash/kb/simData.cPickle + +# Run short daughters to check that everything including division is consistent +python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \ + --config $script_dir/jenkins/configs/reproducibility.json \ + --experiment_id $dir1-$git_hash --agent_id "00" --total_time 10 +python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \ + --config $script_dir/jenkins/configs/reproducibility.json \ + --experiment_id $dir2-$git_hash --agent_id "01" --total_time 10 +$script_dir/debug/diff_simouts.py $out_dir/$dir1-$git_hash $out_dir/$dir2-$git_hash diff --git a/runscripts/jenkins/configs/ecoli-glucose-minimal.json b/runscripts/jenkins/configs/ecoli-glucose-minimal.json new file mode 100644 index 000000000..e6bac3c76 --- /dev/null +++ b/runscripts/jenkins/configs/ecoli-glucose-minimal.json @@ -0,0 +1,15 @@ +{ + "experiment_id": "Daily build", + "single_daughters": true, + "generations": 16, + "fail_at_total_time": true, + "emitter" : { + "type": "parquet", + "config": { + "out_dir": "/scratch/groups/mcovert/vecoli" + } + }, + "analysis_options": { + "single": {"mass_fraction_summary": {}} + } +} diff --git a/runscripts/jenkins/configs/reproducibility.json b/runscripts/jenkins/configs/reproducibility.json new file mode 100644 index 000000000..b169dd053 --- /dev/null +++ b/runscripts/jenkins/configs/reproducibility.json @@ -0,0 +1,10 @@ +{ + "generations": 1, + "fail_at_total_time": true, + "emitter" : { + "type": "parquet", + "config": { + "out_dir": "out" + } + } +} diff --git a/runscripts/jenkins/setup-environment.sh b/runscripts/jenkins/setup-environment.sh new file mode 100644 index 000000000..2232d1d2a --- /dev/null +++ b/runscripts/jenkins/setup-environment.sh @@ -0,0 +1,14 @@ +set -e + +export PYTHONPATH=$PWD +module load wcEcoli/python3 java/18.0.2 nextflow + +export PATH="${GROUP_HOME}/pyenv/bin:${PATH}" +eval "$(pyenv init -)" +eval "$(pyenv virtualenv-init -)" + +### Edit this line to make this branch use another pyenv +pyenv local viv-ecoli +pyenv activate + +make clean compile diff --git a/runscripts/jenkins/workflow.sh b/runscripts/jenkins/workflow.sh new file mode 100644 index 000000000..5435f6529 --- /dev/null +++ b/runscripts/jenkins/workflow.sh @@ -0,0 +1,5 @@ +set -e + +source runscripts/jenkins/setup-environment.sh + +python runscripts/workflow.py --config $1 From 5e1417e3359498b95cd75955359ab1ac6e7996ec Mon Sep 17 00:00:00 2001 From: thalassemia Date: Tue, 2 Jul 2024 21:43:56 -0700 Subject: [PATCH 5/5] Jenkins reproducibility and two gen tests --- runscripts/debug/compare_pickles.py | 196 +++++++++++------- runscripts/debug/diff_simouts.py | 47 +++-- runscripts/jenkins/check-reproducibility.sh | 26 ++- .../jenkins/configs/reproducibility.json | 2 +- .../jenkins/configs/two_generations.json | 19 ++ 5 files changed, 189 insertions(+), 101 deletions(-) create mode 100644 runscripts/jenkins/configs/two_generations.json diff --git a/runscripts/debug/compare_pickles.py b/runscripts/debug/compare_pickles.py index 37c75504c..1c7a7fe4d 100644 --- a/runscripts/debug/compare_pickles.py +++ b/runscripts/debug/compare_pickles.py @@ -5,6 +5,7 @@ Usage (PATH is a path like 'out/manual/intermediates'): runscripts/debug/comparePickles.py PATH1 PATH2 """ + import argparse from collections.abc import Mapping, Sequence import functools @@ -32,9 +33,9 @@ # Objects with a list of attributes to compare SPECIAL_OBJECTS = { - scipy.interpolate.CubicSpline: ['x', 'c', 'axis'], - wholecell.utils.unit_struct_array.UnitStructArray: ['struct_array', 'units'], - } + scipy.interpolate.CubicSpline: ["x", "c", "axis"], + wholecell.utils.unit_struct_array.UnitStructArray: ["struct_array", "units"], +} LEAF_TYPES = ( unum.Unum, @@ -44,13 +45,15 @@ functools.partial, types.FunctionType, dense.MutableDenseMatrix, - wholecell.utils.unit_struct_array.UnitStructArray) + wholecell.utils.unit_struct_array.UnitStructArray, +) -WHITESPACE = re.compile(r'\s+') +WHITESPACE = re.compile(r"\s+") class Repr(object): - '''A Repr has the given repr() string without quotes and != any other value.''' + """A Repr has the given repr() string without quotes and != any other value.""" + def __init__(self, repr_): self.repr_ = repr_ @@ -64,7 +67,8 @@ def has_python_vars(obj): ordinary fields or compact slots. If not, it's presumably a built-in type or extension type implemented entirely in C and Cython. """ - return hasattr(obj, '__dict__') or hasattr(obj, '__slots__') + return hasattr(obj, "__dict__") or hasattr(obj, "__slots__") + def all_vars(obj): """ @@ -73,14 +77,15 @@ def all_vars(obj): If the object implements the pickling method `__getstate__`, call that instead to get its defining state. """ - if hasattr(obj, '__getstate__'): + if hasattr(obj, "__getstate__"): # noinspection PyCallingNonCallable return obj.__getstate__() - attrs = getattr(obj, '__dict__', {}) - attrs.update({key: getattr(obj, key) for key in getattr(obj, '__slots__', ())}) + attrs = getattr(obj, "__dict__", {}) + attrs.update({key: getattr(obj, key) for key in getattr(obj, "__slots__", ())}) return attrs + def is_leaf(value, leaves=LEAF_TYPES): """ Predicate to determine if we have reached the end of how deep we want to traverse @@ -88,11 +93,14 @@ def is_leaf(value, leaves=LEAF_TYPES): """ if isinstance(value, (Mapping, Sequence)): return isinstance(value, (bytes, str)) - return (callable(value) # it's callable - or isinstance(value, leaves) # it's an instance of a declared leaf type - or not has_python_vars(value)) # an object without Python instance variables + return ( + callable(value) # it's callable + or isinstance(value, leaves) # it's an instance of a declared leaf type + or not has_python_vars(value) + ) # an object without Python instance variables + -def object_tree(obj, path='', debug=None): +def object_tree(obj, path="", debug=None): """ Diagnostic tool to inspect a complex data structure. @@ -115,26 +123,34 @@ def object_tree(obj, path='', debug=None): it will only print methods and functions it finds. """ - if debug == 'ALL': + if debug == "ALL": print(path) if is_leaf(obj): - if callable(obj) and (debug == 'CALLABLE'): - print('{}: {}'.format(path, obj)) + if callable(obj) and (debug == "CALLABLE"): + print("{}: {}".format(path, obj)) return obj elif isinstance(obj, Mapping): - return {key: object_tree(value, "{}['{}']".format(path, key), debug) - for (key, value) in obj.items()} + return { + key: object_tree(value, "{}['{}']".format(path, key), debug) + for (key, value) in obj.items() + } elif isinstance(obj, Sequence): - return [object_tree(subobj, "{}[{}]".format(path, index), debug) for index, subobj in enumerate(obj)] + return [ + object_tree(subobj, "{}[{}]".format(path, index), debug) + for index, subobj in enumerate(obj) + ] else: attrs = all_vars(obj) - tree = {key: object_tree(value, "{}.{}".format(path, key), debug) - for (key, value) in attrs.items()} - tree['!type'] = type(obj) + tree = { + key: object_tree(value, "{}.{}".format(path, key), debug) + for (key, value) in attrs.items() + } + tree["!type"] = type(obj) return tree + def size_tree(o, cutoff=0.1): """ Find the size of attributes in an object tree. Sizes greater than the cutoff @@ -149,7 +165,7 @@ def return_val(total, value): if total > cutoff and value: return total, value else: - return total, + return (total,) def get_size(o): return sys.getsizeof(o) / 2**20 # convert to MB @@ -160,12 +176,12 @@ def get_size(o): if isinstance(o, unum.Unum): size += size_tree(o._unit)[0] size += get_size(o._value) - return size, + return (size,) # special handling of leaf to get size of str sequence elif isinstance(o, Bio.Seq.Seq): size += get_size(o._data) - return size, + return (size,) # special handling of leaf, each entry is allocated the same amount of space elif isinstance(o, wholecell.utils.unit_struct_array.UnitStructArray): @@ -173,7 +189,7 @@ def get_size(o): n_entries = len(o.struct_array) if n_entries: size += get_size(o.struct_array[0]) * n_entries - return size, + return (size,) # if a special object, check predefined attributes for equality elif type(o) in SPECIAL_OBJECTS: @@ -183,7 +199,7 @@ def get_size(o): subsizes = size_tree(getattr(o, attr), cutoff) size += subsizes[0] if subsizes[0] > cutoff: - formatted = float('{:.2f}'.format(subsizes[0])) + formatted = float("{:.2f}".format(subsizes[0])) if len(subsizes) == 1: val = formatted else: @@ -194,7 +210,7 @@ def get_size(o): # if it is a leaf, just return the size # TODO: any special handling for types that are not already accounted for above elif is_leaf(o): - return size, + return (size,) # if it is a dictionary, then get the size of keys and values elif isinstance(o, Mapping): @@ -205,7 +221,7 @@ def get_size(o): entry_size = subsizes[0] + get_size(key) total_size += entry_size if entry_size > cutoff: - formatted = float('{:.2f}'.format(entry_size)) + formatted = float("{:.2f}".format(entry_size)) if len(subsizes) == 1: val = formatted else: @@ -221,7 +237,7 @@ def get_size(o): subsizes = size_tree(value, cutoff) total_size += subsizes[0] if subsizes[0] > cutoff: - formatted = float('{:.2f}'.format(subsizes[0])) + formatted = float("{:.2f}".format(subsizes[0])) if len(subsizes) == 1: val = formatted else: @@ -230,7 +246,8 @@ def get_size(o): return return_val(total_size, sizes) else: - return size, + return (size,) + def _are_instances_of(a, b, a_type): """ @@ -239,6 +256,7 @@ def _are_instances_of(a, b, a_type): """ return isinstance(a, a_type) and isinstance(b, a_type) + def diff_trees(a, b): """ Find the differences between two trees or leaf nodes a and b. Return a @@ -255,8 +273,7 @@ def diff_trees(a, b): # treat str and Python 2 unicode as the same leaf type # ditto for int and Python 2 long - if (_are_instances_of(a, b, str) - or _are_instances_of(a, b, int)): + if _are_instances_of(a, b, str) or _are_instances_of(a, b, int): if a != b: return elide(a), elide(b) @@ -296,8 +313,8 @@ def diff_trees(a, b): # if they are dictionaries then diff the value under each key elif isinstance(a, Mapping): diff = {} - na = Repr('--') - nb = Repr('--') + na = Repr("--") + nb = Repr("--") for key in set(a.keys()) | set(b.keys()): subdiff = diff_trees(a.get(key, na), b.get(key, nb)) if subdiff: @@ -307,9 +324,9 @@ def diff_trees(a, b): # if they are sequences then compare each element at each index elif isinstance(a, Sequence): if len(a) > len(b): - b = list(b) + (len(a) - len(b)) * [Repr('--')] + b = list(b) + (len(a) - len(b)) * [Repr("--")] elif len(b) > len(a): - a = list(a) + (len(b) - len(a)) * [Repr('--')] + a = list(a) + (len(b) - len(a)) * [Repr("--")] diff = [] for index in range(len(a)): @@ -320,23 +337,26 @@ def diff_trees(a, b): # this should never happen else: - print('value not considered by `diff_trees`: {} {}'.format(a, b)) + print("value not considered by `diff_trees`: {} {}".format(a, b)) + def elide(value, max_len=200): - '''Return a value with the same repr but elided if it'd be longer than max.''' + """Return a value with the same repr but elided if it'd be longer than max.""" repr_ = repr(value) if len(repr_) > max_len: - return Repr(repr_[:max_len] + '...') + return Repr(repr_[:max_len] + "...") return value + def simplify_error_message(message): - return elide(Repr(WHITESPACE.sub(' ', message).strip())) + return elide(Repr(WHITESPACE.sub(" ", message).strip())) + def compare_floats(f1, f2): - '''Compare two floats, allowing some tolerance, NaN, and Inf values. This + """Compare two floats, allowing some tolerance, NaN, and Inf values. This considers all types of NaN to match. Return 0.0 (which is falsey) if they match, else (f1, f2). - ''' + """ if f1 == f2 or np.isnan(f1) and np.isnan(f2): return 0.0 try: @@ -346,14 +366,15 @@ def compare_floats(f1, f2): # FWIW, the string error.args[0] tells the NULP difference. return f1, f2 + def compare_ndarrays(array1, array2): - '''Compare two ndarrays, checking the shape and all elements, allowing for + """Compare two ndarrays, checking the shape and all elements, allowing for NaN values and non-numeric values. Return () if they match, else a tuple of diff info or just a diff description. TODO(jerry): Allow tolerance for float elements of structured arrays and handle NaN and Inf values. - ''' + """ def summarize_array(ndarray): return Repr(f"array({ndarray.shape} {ndarray.dtype})") @@ -365,7 +386,7 @@ def summarize_array(ndarray): if issubclass(array1.dtype.type, np.floating): try: # This handles float tolerance but not NaN and Inf. - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): np.testing.assert_array_almost_equal_nulp(array1, array2, nulp=NULP) return () except AssertionError as _: @@ -392,28 +413,28 @@ def summarize_array(ndarray): def load_tree(path): """Load a .cPickle file as an object_tree.""" with open(path, "rb") as f: - data = pickle.load(f, fix_imports=True, encoding='latin1') + data = pickle.load(f, fix_imports=True, encoding="latin1") return object_tree(data) def load_fit_tree(out_subdir): - '''Load the parameter calculator's (Parca's) output as an object_tree.''' + """Load the parameter calculator's (Parca's) output as an object_tree.""" # For convenience, optionally add the prefix 'out/'. if not os.path.isabs(out_subdir) and not os.path.isdir(out_subdir): - out_subdir = os.path.join('out', out_subdir) + out_subdir = os.path.join("out", out_subdir) path = os.path.join( - out_subdir, - constants.KB_DIR, - constants.SERIALIZED_SIM_DATA_FILENAME) + out_subdir, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME + ) return load_tree(path) + def pprint_diffs(diffs, *, width=160, print_diff_lines=True, print_count=True): - '''Pretty-print the diff info: optionally print the detailed diff lines, + """Pretty-print the diff info: optionally print the detailed diff lines, optionally print the diff line count as a single figure of merit; then return the line count. - ''' + """ if diffs: diff_lines = pformat(diffs, width=width) if print_diff_lines: @@ -423,7 +444,7 @@ def pprint_diffs(diffs, *, width=160, print_diff_lines=True, print_count=True): line_count = 0 if print_count: - print('==> lines of differences: {}'.format(line_count)) + print("==> lines of differences: {}".format(line_count)) return line_count @@ -438,9 +459,11 @@ def diff_files(path1: str, path2: str, print_diff_lines: bool = True) -> int: def list_pickles(directory: str) -> Dict[str, str]: """Return a map of .cPickle file names to paths in the given directory sorted by file modification time then by filename.""" - entries = [(entry.stat().st_mtime, entry.name, entry.path) - for entry in os.scandir(directory) - if entry.is_file() and entry.name.endswith('.cPickle')] + entries = [ + (entry.stat().st_mtime, entry.name, entry.path) + for entry in os.scandir(directory) + if entry.is_file() and entry.name.endswith(".cPickle") + ] files = {e[1]: e[2] for e in sorted(entries)} return files @@ -459,40 +482,61 @@ def diff_dirs(dir1: str, dir2: str, print_diff_lines: bool = True) -> int: if path2: count += diff_files(path1, path2, print_diff_lines) else: - print(f'{name} is in {dir1} but not {dir2}') + print(f"{name} is in {dir1} but not {dir2}") count += 1 only_in_dir2 = pickles2.keys() - pickles1.keys() if only_in_dir2: - print(f'\n*** Pickle files in {dir2} but not {dir1}:\n' - f'{sorted(only_in_dir2)}') + print( + f"\n*** Pickle files in {dir2} but not {dir1}:\n" f"{sorted(only_in_dir2)}" + ) count += len(only_in_dir2) - print(f'\n====> Total differences: {count} lines for {len(pickles1)} pickle' - f' files in {dir1} against {len(pickles2)} pickle files in {dir2}.') + print( + f"\n====> Total differences: {count} lines for {len(pickles1)} pickle" + f" files in {dir1} against {len(pickles2)} pickle files in {dir2}." + ) return count -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser( description="Compare two .cPickle files" - " or all the .cPickle files in two directories (in" - " modification-time order)." - " Print a count and optionally a summary of the differences.") - parser.add_argument('-c', '--count', action='store_true', + " or all the .cPickle files in two directories (in" + " modification-time order)." + " Print a count and optionally a summary of the differences." + ) + parser.add_argument( + "-c", + "--count", + action="store_true", help="Print just the diff line count for each file, skipping the" - " detailed diff lines.") - parser.add_argument('-f', '--final-sim-data', action='store_true', + " detailed diff lines.", + ) + parser.add_argument( + "-f", + "--final-sim-data", + action="store_true", help="Append /kb/simData.cPickle to the two PATH args to make it a" - " little easier compare the final Parca output sim_data.") - parser.add_argument('path', metavar='PATH', nargs=2, - help="The two pickle files or directories to compare.") + " little easier compare the final Parca output sim_data.", + ) + parser.add_argument( + "path", + metavar="PATH", + nargs=2, + help="The two pickle files or directories to compare.", + ) args = parser.parse_args() path1, path2 = args.path if args.final_sim_data: - path1 = os.path.join(path1, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME) - path2 = os.path.join(path2, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME) + path1 = os.path.join( + path1, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME + ) + path2 = os.path.join( + path2, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME + ) if os.path.isfile(path1): diff_count = diff_files(path1, path2, print_diff_lines=not args.count) diff --git a/runscripts/debug/diff_simouts.py b/runscripts/debug/diff_simouts.py index 5575d9e23..5ad4b74cf 100644 --- a/runscripts/debug/diff_simouts.py +++ b/runscripts/debug/diff_simouts.py @@ -1,24 +1,43 @@ import argparse import duckdb +import numpy as np -from ecoli.library.parquet_emitter import get_dataset_sql +from ecoli.library.parquet_emitter import get_dataset_sql, ndlist_to_ndarray if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Compare simulation output in two ouput directories") - parser.add_argument('path', metavar='PATH', nargs=2, - help="The two output directories to compare.") + description="Compare simulation output of two experiment IDs." + ) + parser.add_argument( + "-o", + "--output", + help="Output directory containing data for two experiment IDs.", + ) + parser.add_argument( + "exp_ids", metavar="EXP_ID", nargs=2, help="The two experiment IDs to compare." + ) args = parser.parse_args() - path1, path2 = args.path + exp_id_1, exp_id_2 = args.exp_ids - h_1, c_1 = get_dataset_sql(path1) - h_2, c_2 = get_dataset_sql(path2) + history_sql, _ = get_dataset_sql(args.output) id_cols = "experiment_id, variant, lineage_seed, generation, agent_id, time" - ordered_sql = f"SELECT * FROM ({{sql_query}}) ORDER BY {id_cols}" - for sql_1, sql_2 in [(h_1, h_2), (c_1, c_2)]: - data_1 = duckdb.sql(ordered_sql.format(sql_query=sql_1)).arrow() - data_2 = duckdb.sql(ordered_sql.format(sql_query=sql_2)).arrow() - assert data_1.column_names == data_2.column_names, "Different columns." - for i, (col_1, col_2) in enumerate(zip(data_1, data_2)): - assert col_1 == col_2, f"{data_1.column_names[i]} is different." + ordered_sql = f"SELECT * FROM ({{sql_query}}) WHERE experiment_id = '{{exp_id}}' ORDER BY {id_cols}" + data_1 = duckdb.sql( + ordered_sql.format(sql_query=history_sql, exp_id=exp_id_1) + ).arrow() + data_2 = duckdb.sql( + ordered_sql.format(sql_query=history_sql, exp_id=exp_id_2) + ).arrow() + assert data_1.column_names == data_2.column_names, "Different columns." + for i, (col_1, col_2) in enumerate(zip(data_1, data_2)): + if col_1 != col_2 and data_1.column_names[i] not in [ + "experiment_id", + "agent_id", + ]: + np.testing.assert_allclose( + ndlist_to_ndarray(col_1), + ndlist_to_ndarray(col_2), + atol=1e-12, + err_msg=f"{data_1.column_names[i]} not equal", + ) diff --git a/runscripts/jenkins/check-reproducibility.sh b/runscripts/jenkins/check-reproducibility.sh index a8f7b848e..1115a2f96 100644 --- a/runscripts/jenkins/check-reproducibility.sh +++ b/runscripts/jenkins/check-reproducibility.sh @@ -9,6 +9,7 @@ set -eu script_dir=$(dirname $(dirname $(realpath $0))) out_dir=$(dirname $script_dir)/out +data_dir=$(dirname $script_dir)/data dir1="check-reproducibility-1" dir2="check-reproducibility-2" git_hash=$(git rev-parse HEAD) @@ -16,25 +17,30 @@ git_hash=$(git rev-parse HEAD) source $script_dir/jenkins/setup-environment.sh # Run parca twice and check that output is consistent from run to run -python $script_dir/parca.py -c 4 -o $out_dir/$dir1-$git_hash --save-intermediates -python $script_dir/parca.py -c 4 -o $out_dir/$dir2-$git_hash --save-intermediates -$script_dir/debug/compare_pickles.py $out_dir/$dir1-$git_hash/kb $out_dir/$dir2-$git_hash/kb +python $script_dir/parca.py -c 4 -o "$out_dir/$dir1-$git_hash" +python $script_dir/parca.py -c 4 -o "$out_dir/$dir2-$git_hash" +python $script_dir/debug/compare_pickles.py "$out_dir/$dir1-$git_hash/kb" "$out_dir/$dir2-$git_hash/kb" # Run entire simulation for each parca output python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \ --config $script_dir/jenkins/configs/reproducibility.json \ - --experiment_id $dir1-$git_hash --daughter_outdir $out_dir/$dir1-$git_hash \ - --sim_data_path $out_dir/$dir1-$git_hash/kb/simData.cPickle + --experiment_id "$dir1-$git_hash" --daughter_outdir "$out_dir/$dir1-$git_hash" \ + --sim_data_path "$out_dir/$dir1-$git_hash/kb/simData.cPickle" --fail_at_total_time python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \ --config $script_dir/jenkins/configs/reproducibility.json \ - --experiment_id $dir2-$git_hash --daughter_outdir $out_dir/$dir2-$git_hash \ - --sim_data_path $out_dir/$dir2-$git_hash/kb/simData.cPickle + --experiment_id "$dir2-$git_hash" --daughter_outdir "$out_dir/$dir2-$git_hash" \ + --sim_data_path "$out_dir/$dir2-$git_hash/kb/simData.cPickle" --fail_at_total_time # Run short daughters to check that everything including division is consistent +ln -s "$out_dir/$dir1-$git_hash/daughter_state_0.json" $data_dir/daughter_state_0.json python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \ --config $script_dir/jenkins/configs/reproducibility.json \ - --experiment_id $dir1-$git_hash --agent_id "00" --total_time 10 + --experiment_id "$dir1-$git_hash" --agent_id "00" --total_time 10 \ + --initial_state_file daughter_state_0 +rm $data_dir/daughter_state_0.json +ln -s "$out_dir/$dir2-$git_hash/daughter_state_0.json" $data_dir/daughter_state_0.json python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \ --config $script_dir/jenkins/configs/reproducibility.json \ - --experiment_id $dir2-$git_hash --agent_id "01" --total_time 10 -$script_dir/debug/diff_simouts.py $out_dir/$dir1-$git_hash $out_dir/$dir2-$git_hash + --experiment_id "$dir2-$git_hash" --agent_id "00" --total_time 10 \ + --initial_state_file daughter_state_0 +python $script_dir/debug/diff_simouts.py -o "$out_dir" "$dir1-$git_hash" "$dir2-$git_hash" diff --git a/runscripts/jenkins/configs/reproducibility.json b/runscripts/jenkins/configs/reproducibility.json index b169dd053..ccf6dfe4b 100644 --- a/runscripts/jenkins/configs/reproducibility.json +++ b/runscripts/jenkins/configs/reproducibility.json @@ -1,6 +1,6 @@ { + "suffix_time": false, "generations": 1, - "fail_at_total_time": true, "emitter" : { "type": "parquet", "config": { diff --git a/runscripts/jenkins/configs/two_generations.json b/runscripts/jenkins/configs/two_generations.json new file mode 100644 index 000000000..7563f93cf --- /dev/null +++ b/runscripts/jenkins/configs/two_generations.json @@ -0,0 +1,19 @@ +{ + "experiment_id": "two_generations", + "suffix_time": false, + "parca_options": { + "cpus": 3 + }, + "generations": 2, + "n_init_sims": 1, + "single_daughters": true, + "emitter" : { + "type": "parquet", + "config": { + "out_dir": "out" + } + }, + "analysis_options": { + "single": {"mass_fraction_summary": {}} + } +}