From 3a39091059fc307a4830c521ebc416aebef6aba8 Mon Sep 17 00:00:00 2001
From: thalassemia <cheah_sean@yahoo.com>
Date: Tue, 2 Jul 2024 13:59:05 -0700
Subject: [PATCH 1/5] Type checking and linting CI

---
 .github/workflows/pytest.yml                  | 51 +++++++++++++++++--
 .../analysis/antibiotics_colony/load_data.py  | 24 ++++-----
 .../snapshot_and_hist_plot.py                 | 12 ++---
 validation/ecoli/scripts/rnaDegRates.py       |  2 +-
 wholecell/utils/dependency_graph.py           |  8 ++-
 wholecell/utils/units.py                      |  4 +-
 6 files changed, 70 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 944cccf30..c386c3930 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -1,6 +1,6 @@
 # Modified from GitHub Actions template
 
-name: Pytest
+name: Basic tests and QA
 
 # Improves reproducibility and speed
 env:
@@ -14,7 +14,7 @@ on:
     branches: [master]
 
 jobs:
-  test:
+  Pytest:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -30,12 +30,53 @@ jobs:
         pip install --upgrade pip wheel
         pip install numpy==1.26.4
         pip install -r requirements.txt
-    - name: Setup
-      run: |
-        python setup.py install
     - name: Compile Cython components
       run: |
         make clean compile
     - name: Test with pytest
       run: |
         python -m pytest --cov=ecoli --durations=0
+  Mypy:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip wheel
+        pip install numpy==1.26.4 mypy
+        pip install -r requirements.txt
+    - name: Compile Cython components
+      run: |
+        make clean compile
+    - name: Mypy
+      run: |
+        mypy
+  Lint:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip wheel
+        pip install numpy==1.26.4 ruff
+        pip install -r requirements.txt
+    - name: Compile Cython components
+      run: |
+        make clean compile
+    - name: Ruff
+      run: |
+        ruff check
diff --git a/ecoli/analysis/antibiotics_colony/load_data.py b/ecoli/analysis/antibiotics_colony/load_data.py
index ea87ea283..d62c4a430 100644
--- a/ecoli/analysis/antibiotics_colony/load_data.py
+++ b/ecoli/analysis/antibiotics_colony/load_data.py
@@ -53,23 +53,21 @@ def load_data(
     experiment_id=None, cpus=8, sampling_rate=2, host="10.138.0.75", port=27017
 ):
     # Get data for the specified experiment_id
-    monomers = [path[-1] for path in PATHS_TO_LOAD.values() if path[0] == "monomer"]
-    mrnas = [path[-1] for path in PATHS_TO_LOAD.values() if path[0] == "mrna"]
-    inner_paths = [
-        path
-        for path in PATHS_TO_LOAD.values()
-        if path[-1] not in mrnas
-        and path[-1] not in monomers
-        and path != ("total_mrna",)
-    ]
-    outer_paths = [("data", "dimensions"), ("data", "fields")]
+    # monomers = [path[-1] for path in PATHS_TO_LOAD.values() if path[0] == "monomer"]
+    # mrnas = [path[-1] for path in PATHS_TO_LOAD.values() if path[0] == "mrna"]
+    # inner_paths = [
+    #     path
+    #     for path in PATHS_TO_LOAD.values()
+    #     if path[-1] not in mrnas
+    #     and path[-1] not in monomers
+    #     and path != ("total_mrna",)
+    # ]
+    # outer_paths = [("data", "dimensions"), ("data", "fields")]
     for condition, seeds in EXPERIMENT_ID_MAPPING.items():
         for seed, curr_experiment_id in seeds.items():
             if curr_experiment_id != experiment_id:
                 continue
             metadata = {condition: {seed: {}}}
-            # TODO: Convert to use DuckDB
-            raise NotImplementedError("Still need to convert to use DuckDB!")
             rep_data = {}
             with ProcessPoolExecutor(cpus) as executor:
                 print("Deserializing data and removing units...")
@@ -127,6 +125,8 @@ def main():
     )
     args = parser.parse_args()
     os.makedirs("data/colony_data/sim_dfs/", exist_ok=True)
+    # TODO: Convert to use DuckDB
+    raise NotImplementedError("Still need to convert to use DuckDB!")
     load_data(args.experiment_id, cpus=args.cpus)
 
 
diff --git a/ecoli/analysis/antibiotics_colony/snapshot_and_hist_plot.py b/ecoli/analysis/antibiotics_colony/snapshot_and_hist_plot.py
index 2d48e0483..abbdb801d 100644
--- a/ecoli/analysis/antibiotics_colony/snapshot_and_hist_plot.py
+++ b/ecoli/analysis/antibiotics_colony/snapshot_and_hist_plot.py
@@ -212,12 +212,12 @@ def format_tick(tick):
 
 def get_data(experiment_id, time, molecules, host, port, cpus, verbose):
     # Prepare molecule paths for access_counts()
-    monomers = [m[-1] for m in molecules if m[-2] == "monomer"]
-    mrnas = [m[-1] for m in molecules if m[-2] == "mrna"]
-    inner_paths = [
-        path for path in molecules if path[-1] not in mrnas and path[-1] not in monomers
-    ]
-    outer_paths = [("data", "dimensions")]
+    # monomers = [m[-1] for m in molecules if m[-2] == "monomer"]
+    # mrnas = [m[-1] for m in molecules if m[-2] == "mrna"]
+    # inner_paths = [
+    #     path for path in molecules if path[-1] not in mrnas and path[-1] not in monomers
+    # ]
+    # outer_paths = [("data", "dimensions")]
 
     if verbose:
         print(f"Accessing data for experiment {experiment_id}...")
diff --git a/validation/ecoli/scripts/rnaDegRates.py b/validation/ecoli/scripts/rnaDegRates.py
index 03a1d5825..f9405bfe8 100644
--- a/validation/ecoli/scripts/rnaDegRates.py
+++ b/validation/ecoli/scripts/rnaDegRates.py
@@ -87,7 +87,7 @@ def main():
     plt.plot([0, maxLine], [0, maxLine], "--r")
     plt.plot(model, paper, "o", markeredgecolor="k", markerfacecolor="none")
     plt.axis((0, 1, 0, maxLine))
-    Correlation_ExpPred = np.corrcoef(model, paper)[0][1]
+    Correlation_ExpPred = np.corrcoef(model, paper)[0][1]  # noqa: F841
 
     plt.xlabel("RNA decay rate expected from model [1/min]")
     plt.ylabel("RNA decay rate from paper (Moffitt et al. 2016) [1/min]")
diff --git a/wholecell/utils/dependency_graph.py b/wholecell/utils/dependency_graph.py
index 4e6f71637..eff2f4e1b 100644
--- a/wholecell/utils/dependency_graph.py
+++ b/wholecell/utils/dependency_graph.py
@@ -29,10 +29,9 @@ class DependencyGraph(object):
     def __init__(self):
         # type: () -> None
         """Initialize dependencies to empty dictionary"""
-        self.dependencies = {}  # type: dict[str, list[str]]
+        self.dependencies: dict[str, list[str]] = {}
 
-    def add_nodes(self, nodes):
-        # type: (Iterable[str]) -> None
+    def add_nodes(self, nodes: Iterable[str]):
         """Add nodes with no dependencies
 
         Arguments:
@@ -41,8 +40,7 @@ def add_nodes(self, nodes):
         for node in nodes:
             self.dependencies[node] = []
 
-    def add_dep_relation(self, a, b):
-        # type: (str, str) -> None
+    def add_dep_relation(self, a: str, b: str):
         """Add an edge such that a depends on b
 
         If a or b does not exist yet as a node, it will be created.
diff --git a/wholecell/utils/units.py b/wholecell/utils/units.py
index bb2ed855e..bebfe5ec6 100644
--- a/wholecell/utils/units.py
+++ b/wholecell/utils/units.py
@@ -12,8 +12,8 @@
 import numpy as np
 
 # noinspection PyUnresolvedReferences
-from unum.units import mol, mmol, g, h, L, fg, min, s, umol, dmol, J, K  # satisfy mypy
-from unum.units import *
+from unum.units import mol, mmol, g, h, L, fg, min, s, umol, dmol, J, K  # noqa: F401
+from unum.units import *  # noqa: F403
 from unum import Unum
 
 count = Unum.unit("count", mol / scipy.constants.Avogadro)

From 50be203773d82bbb3afb8f9753409d35bbeec26f Mon Sep 17 00:00:00 2001
From: thalassemia <cheah_sean@yahoo.com>
Date: Tue, 2 Jul 2024 14:27:09 -0700
Subject: [PATCH 2/5] Rename wcecoli_state to json_state

---
 ecoli/composites/ecoli_master.py                  | 2 +-
 ecoli/composites/ecoli_spatial.py                 | 2 +-
 ecoli/experiments/ecoli_engine_process.py         | 2 +-
 ecoli/experiments/metabolism_exchanger.py         | 2 +-
 ecoli/experiments/metabolism_redux_sim.py         | 2 +-
 ecoli/experiments/tet_amp_sim.py                  | 2 +-
 ecoli/library/{wcecoli_state.py => json_state.py} | 0
 ecoli/processes/antibiotics/permeability.py       | 2 +-
 migration/allocator.py                            | 2 +-
 migration/chromosome_replication.py               | 2 +-
 migration/initial_state.py                        | 2 +-
 migration/migration_utils.py                      | 2 +-
 12 files changed, 11 insertions(+), 11 deletions(-)
 rename ecoli/library/{wcecoli_state.py => json_state.py} (100%)

diff --git a/ecoli/composites/ecoli_master.py b/ecoli/composites/ecoli_master.py
index 9a62a2c6a..bf8dd1a9d 100644
--- a/ecoli/composites/ecoli_master.py
+++ b/ecoli/composites/ecoli_master.py
@@ -38,7 +38,7 @@
 
 # state
 from ecoli.processes.partition import Requester, Evolver, Step, Process
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 
 SIM_DATA_PATH = os.path.abspath(
     os.path.join(
diff --git a/ecoli/composites/ecoli_spatial.py b/ecoli/composites/ecoli_spatial.py
index 6216e303a..581afb5f0 100644
--- a/ecoli/composites/ecoli_spatial.py
+++ b/ecoli/composites/ecoli_spatial.py
@@ -83,7 +83,7 @@
     DiffusionNetwork,
 )
 
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 from ecoli.library.schema import attrs, bulk_name_to_idx
 
 SIM_DATA_PATH = "reconstruction/sim_data/kb/simData.cPickle"
diff --git a/ecoli/experiments/ecoli_engine_process.py b/ecoli/experiments/ecoli_engine_process.py
index 4cf8a3958..4b1c61c2f 100644
--- a/ecoli/experiments/ecoli_engine_process.py
+++ b/ecoli/experiments/ecoli_engine_process.py
@@ -31,7 +31,7 @@
 from ecoli.library.logging_tools import write_json
 from ecoli.library.sim_data import RAND_MAX
 from ecoli.library.schema import not_a_process
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 from ecoli.processes.engine_process import EngineProcess
 from ecoli.processes.environment.field_timeline import FieldTimeline
 from ecoli.processes.environment.lysis import Lysis
diff --git a/ecoli/experiments/metabolism_exchanger.py b/ecoli/experiments/metabolism_exchanger.py
index 42a4dd7de..d88c02fe7 100644
--- a/ecoli/experiments/metabolism_exchanger.py
+++ b/ecoli/experiments/metabolism_exchanger.py
@@ -13,7 +13,7 @@
 
 # vivarium-ecoli imports
 from ecoli.library.sim_data import LoadSimData
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 from ecoli.composites.ecoli_master import SIM_DATA_PATH
 from ecoli.processes.metabolism_redux import MetabolismRedux
 from ecoli.processes.stubs.exchange_stub import Exchange
diff --git a/ecoli/experiments/metabolism_redux_sim.py b/ecoli/experiments/metabolism_redux_sim.py
index 4d45e49f7..de2bbcb01 100644
--- a/ecoli/experiments/metabolism_redux_sim.py
+++ b/ecoli/experiments/metabolism_redux_sim.py
@@ -11,7 +11,7 @@
 
 # vivarium-ecoli imports
 from ecoli.experiments.ecoli_master_sim import EcoliSim, CONFIG_DIR_PATH
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 
 
 import numpy as np
diff --git a/ecoli/experiments/tet_amp_sim.py b/ecoli/experiments/tet_amp_sim.py
index 175025228..218ea1f4a 100644
--- a/ecoli/experiments/tet_amp_sim.py
+++ b/ecoli/experiments/tet_amp_sim.py
@@ -9,7 +9,7 @@
 from ecoli.experiments.ecoli_master_sim import CONFIG_DIR_PATH, SimConfig
 from ecoli.library.parameters import param_store
 from ecoli.library.logging_tools import write_json
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 
 AVOGADRO = constants.N_A / units.mol
 
diff --git a/ecoli/library/wcecoli_state.py b/ecoli/library/json_state.py
similarity index 100%
rename from ecoli/library/wcecoli_state.py
rename to ecoli/library/json_state.py
diff --git a/ecoli/processes/antibiotics/permeability.py b/ecoli/processes/antibiotics/permeability.py
index c71d1b6e8..689ecb172 100644
--- a/ecoli/processes/antibiotics/permeability.py
+++ b/ecoli/processes/antibiotics/permeability.py
@@ -1,6 +1,6 @@
 import numpy as np
 from ecoli.library.schema import numpy_schema, bulk_name_to_idx, counts
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 from ecoli.processes.bulk_timeline import BulkTimelineProcess
 from vivarium.core.emitter import timeseries_from_data
 from vivarium.core.engine import Engine
diff --git a/migration/allocator.py b/migration/allocator.py
index 1cdb0a243..86aec933f 100644
--- a/migration/allocator.py
+++ b/migration/allocator.py
@@ -9,7 +9,7 @@
 from ecoli.processes.allocator import Allocator
 from migration import LOAD_SIM_DATA
 from migration.migration_utils import run_non_partitioned_process, recursive_compare
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 
 
 with open("data/proc_to_name.json", "r") as f:
diff --git a/migration/chromosome_replication.py b/migration/chromosome_replication.py
index 854f3e786..6209b19ad 100644
--- a/migration/chromosome_replication.py
+++ b/migration/chromosome_replication.py
@@ -4,7 +4,7 @@
 
 from vivarium.core.engine import pf
 
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 from ecoli.processes.chromosome_replication import ChromosomeReplication
 
 from migration.migration_utils import get_process_state, run_and_compare
diff --git a/migration/initial_state.py b/migration/initial_state.py
index e9a6a867e..359ad292a 100644
--- a/migration/initial_state.py
+++ b/migration/initial_state.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from ecoli.library.sim_data import LoadSimData, SIM_DATA_PATH_NO_OPERONS
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 
 
 def compare_states(viv_state, wc_state):
diff --git a/migration/migration_utils.py b/migration/migration_utils.py
index 77ce7fb2e..ba8d25e26 100644
--- a/migration/migration_utils.py
+++ b/migration/migration_utils.py
@@ -8,7 +8,7 @@
 from vivarium.library.dict_utils import deep_merge
 
 from wholecell.utils import units
-from ecoli.library.wcecoli_state import get_state_from_file
+from ecoli.library.json_state import get_state_from_file
 from migration import LOAD_SIM_DATA, LOAD_SIM_DATA_NO_OPERONS
 
 PERCENT_ERROR_THRESHOLD = 0.05

From 6f9b46cd819fcd0f56fd55b1dcb243733ac4e9b1 Mon Sep 17 00:00:00 2001
From: thalassemia <cheah_sean@yahoo.com>
Date: Tue, 2 Jul 2024 14:27:51 -0700
Subject: [PATCH 3/5] Delete old log update and division topology logic

---
 ecoli/composites/ecoli_configs/__init__.py | 15 ---------------
 ecoli/experiments/ecoli_master_sim.py      |  8 --------
 2 files changed, 23 deletions(-)

diff --git a/ecoli/composites/ecoli_configs/__init__.py b/ecoli/composites/ecoli_configs/__init__.py
index 5addb2a6b..7e9a6cd34 100644
--- a/ecoli/composites/ecoli_configs/__init__.py
+++ b/ecoli/composites/ecoli_configs/__init__.py
@@ -97,18 +97,3 @@
         )
 
     ECOLI_DEFAULT_TOPOLOGY[process] = process_topology
-
-# Add log_update ports if log_updates is True
-if config["log_updates"]:
-    for process, ports in ECOLI_DEFAULT_TOPOLOGY.items():
-        ECOLI_DEFAULT_TOPOLOGY[process]["log_update"] = (
-            "log_update",
-            process,
-        )
-
-# add division
-if config["divide"]:
-    ECOLI_DEFAULT_TOPOLOGY["division"] = {
-        "variable": ("listeners", "mass", "cell_mass"),
-        "agents": ["..", "..", "agents"],
-    }
diff --git a/ecoli/experiments/ecoli_master_sim.py b/ecoli/experiments/ecoli_master_sim.py
index 6e5554451..bf5423e1d 100644
--- a/ecoli/experiments/ecoli_master_sim.py
+++ b/ecoli/experiments/ecoli_master_sim.py
@@ -607,14 +607,6 @@ def _retrieve_topology(
                 deep_merge(process_topology, tuplify_topology(topology[process]))
             result[process] = process_topology
 
-        # Add log_update ports if log_updates is True
-        if log_updates:
-            for process, ports in result.items():
-                result[process]["log_update"] = (
-                    "log_update",
-                    process,
-                )
-
         return result
 
     def _retrieve_process_configs(

From 321f52de4800fb6b3ff30e6b48b2520514bf5302 Mon Sep 17 00:00:00 2001
From: thalassemia <cheah_sean@yahoo.com>
Date: Tue, 2 Jul 2024 18:14:29 -0700
Subject: [PATCH 4/5] Basic Jenkins tests

---
 runscripts/create_gcp_bucket.sh               |   2 -
 runscripts/debug/compare_pickles.py           | 502 ++++++++++++++++++
 runscripts/debug/diff_simouts.py              |  24 +
 runscripts/jenkins/check-reproducibility.sh   |  40 ++
 .../configs/ecoli-glucose-minimal.json        |  15 +
 .../jenkins/configs/reproducibility.json      |  10 +
 runscripts/jenkins/setup-environment.sh       |  14 +
 runscripts/jenkins/workflow.sh                |   5 +
 8 files changed, 610 insertions(+), 2 deletions(-)
 delete mode 100644 runscripts/create_gcp_bucket.sh
 create mode 100644 runscripts/debug/compare_pickles.py
 create mode 100644 runscripts/debug/diff_simouts.py
 create mode 100644 runscripts/jenkins/check-reproducibility.sh
 create mode 100644 runscripts/jenkins/configs/ecoli-glucose-minimal.json
 create mode 100644 runscripts/jenkins/configs/reproducibility.json
 create mode 100644 runscripts/jenkins/setup-environment.sh
 create mode 100644 runscripts/jenkins/workflow.sh

diff --git a/runscripts/create_gcp_bucket.sh b/runscripts/create_gcp_bucket.sh
deleted file mode 100644
index 9d5576ded..000000000
--- a/runscripts/create_gcp_bucket.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-gcloud storage buckets create gs://$0 --location=$1
diff --git a/runscripts/debug/compare_pickles.py b/runscripts/debug/compare_pickles.py
new file mode 100644
index 000000000..37c75504c
--- /dev/null
+++ b/runscripts/debug/compare_pickles.py
@@ -0,0 +1,502 @@
+"""
+Compare two .cPickle files or all the .cPickle files in a pair of directories.
+Show the differences or optionally just a count of difference lines.
+
+Usage (PATH is a path like 'out/manual/intermediates'):
+    runscripts/debug/comparePickles.py PATH1 PATH2
+"""
+import argparse
+from collections.abc import Mapping, Sequence
+import functools
+import numbers
+import os
+import pickle
+from pprint import pformat
+import re
+import sys
+import types
+from typing import Dict
+
+import Bio.Seq
+import numpy as np
+import scipy.interpolate
+import sympy
+from sympy.matrices import dense
+import unum
+
+from wholecell.utils import constants
+import wholecell.utils.unit_struct_array
+
+
+NULP = 0  # float comparison tolerance, in Number of Units in the Last Place
+
+# Objects with a list of attributes to compare
+SPECIAL_OBJECTS = {
+    scipy.interpolate.CubicSpline: ['x', 'c', 'axis'],
+    wholecell.utils.unit_struct_array.UnitStructArray: ['struct_array', 'units'],
+    }
+
+LEAF_TYPES = (
+    unum.Unum,
+    Bio.Seq.Seq,
+    sympy.Basic,
+    numbers.Number,
+    functools.partial,
+    types.FunctionType,
+    dense.MutableDenseMatrix,
+    wholecell.utils.unit_struct_array.UnitStructArray)
+
+WHITESPACE = re.compile(r'\s+')
+
+
+class Repr(object):
+    '''A Repr has the given repr() string without quotes and != any other value.'''
+    def __init__(self, repr_):
+        self.repr_ = repr_
+
+    def __repr__(self):
+        return self.repr_
+
+
+def has_python_vars(obj):
+    """
+    Returns true if the given object has any Python instance variables, that is
+    ordinary fields or compact slots. If not, it's presumably a built-in type
+    or extension type implemented entirely in C and Cython.
+    """
+    return hasattr(obj, '__dict__') or hasattr(obj, '__slots__')
+
+def all_vars(obj):
+    """
+    Returns a dict of all the object's instance variables stored in ordinary
+    fields and in compact slots. This expands on the built-in function `vars()`.
+    If the object implements the pickling method `__getstate__`, call that
+    instead to get its defining state.
+    """
+    if hasattr(obj, '__getstate__'):
+        # noinspection PyCallingNonCallable
+        return obj.__getstate__()
+
+    attrs = getattr(obj, '__dict__', {})
+    attrs.update({key: getattr(obj, key) for key in getattr(obj, '__slots__', ())})
+    return attrs
+
+def is_leaf(value, leaves=LEAF_TYPES):
+    """
+    Predicate to determine if we have reached the end of how deep we want to traverse
+    through the object tree.
+    """
+    if isinstance(value, (Mapping, Sequence)):
+        return isinstance(value, (bytes, str))
+    return (callable(value)                 # it's callable
+            or isinstance(value, leaves)    # it's an instance of a declared leaf type
+            or not has_python_vars(value))  # an object without Python instance variables
+
+def object_tree(obj, path='', debug=None):
+    """
+    Diagnostic tool to inspect a complex data structure.
+
+    Given an object, exhaustively traverse down all attributes it contains until leaves
+    are	reached, and convert everything found into a dictionary or a list. The resulting
+    dictionary will mirror the structure of the original object, but instead of
+    attributes with values it will be a dictionary where the keys are the attribute
+    names. The type of the dictionarified object will be encoded under the key `!type`
+    which is assumed to not be in conflict with any other attributes. The result should
+    aid in serialization and deserialization of the object and is intended to be a
+    translation of a pickled object.
+
+    Args:
+        obj (object): The object to inspect.
+        path (optional str): The root path of this object tree. This will be built upon
+            for each child of the current object found and reported in a value is
+            provided for `debug`.
+        debug (optional str): If provided, prints paths of the attributes encountered.
+            If the value is 'ALL', it will print every path. If the value is 'CALLABLE',
+            it will only print methods and functions it finds.
+    """
+
+    if debug == 'ALL':
+        print(path)
+
+    if is_leaf(obj):
+        if callable(obj) and (debug == 'CALLABLE'):
+            print('{}: {}'.format(path, obj))
+        return obj
+    elif isinstance(obj, Mapping):
+        return {key: object_tree(value, "{}['{}']".format(path, key), debug)
+            for (key, value) in obj.items()}
+    elif isinstance(obj, Sequence):
+        return [object_tree(subobj, "{}[{}]".format(path, index), debug) for index, subobj in enumerate(obj)]
+    else:
+        attrs = all_vars(obj)
+        tree = {key: object_tree(value, "{}.{}".format(path, key), debug)
+                for (key, value) in attrs.items()}
+        tree['!type'] = type(obj)
+
+        return tree
+
+def size_tree(o, cutoff=0.1):
+    """
+    Find the size of attributes in an object tree. Sizes greater than the cutoff
+    (in MB) will be returned for displaying. Sizes include all values contained
+    within an attribute (eg. a Dict will be represented by the size of all keys
+    and values in addition to the Dict size itself).
+
+    TODO: double check total size vs disk size - might be missing some types
+    """
+
+    def return_val(total, value):
+        if total > cutoff and value:
+            return total, value
+        else:
+            return total,
+
+    def get_size(o):
+        return sys.getsizeof(o) / 2**20  # convert to MB
+
+    size = get_size(o)
+
+    # special handling of leaf to get size of defining attributes
+    if isinstance(o, unum.Unum):
+        size += size_tree(o._unit)[0]
+        size += get_size(o._value)
+        return size,
+
+    # special handling of leaf to get size of str sequence
+    elif isinstance(o, Bio.Seq.Seq):
+        size += get_size(o._data)
+        return size,
+
+    # special handling of leaf, each entry is allocated the same amount of space
+    elif isinstance(o, wholecell.utils.unit_struct_array.UnitStructArray):
+        size += size_tree(o.units)[0]
+        n_entries = len(o.struct_array)
+        if n_entries:
+            size += get_size(o.struct_array[0]) * n_entries
+        return size,
+
+    # if a special object, check predefined attributes for equality
+    elif type(o) in SPECIAL_OBJECTS:
+        sizes = {}
+        attrs = SPECIAL_OBJECTS[type(o)]
+        for attr in attrs:
+            subsizes = size_tree(getattr(o, attr), cutoff)
+            size += subsizes[0]
+            if subsizes[0] > cutoff:
+                formatted = float('{:.2f}'.format(subsizes[0]))
+                if len(subsizes) == 1:
+                    val = formatted
+                else:
+                    val = (formatted, subsizes[1])
+                sizes[attr] = val
+        return return_val(size, sizes)
+
+    # if it is a leaf, just return the size
+    # TODO: any special handling for types that are not already accounted for above
+    elif is_leaf(o):
+        return size,
+
+    # if it is a dictionary, then get the size of keys and values
+    elif isinstance(o, Mapping):
+        sizes = {}
+        total_size = size
+        for key, value in o.items():
+            subsizes = size_tree(value, cutoff)
+            entry_size = subsizes[0] + get_size(key)
+            total_size += entry_size
+            if entry_size > cutoff:
+                formatted = float('{:.2f}'.format(entry_size))
+                if len(subsizes) == 1:
+                    val = formatted
+                else:
+                    val = (formatted, subsizes[1])
+                sizes[key] = val
+        return return_val(total_size, sizes)
+
+    # if it is a sequence, then get the size of each element
+    elif isinstance(o, Sequence):
+        sizes = []
+        total_size = size
+        for value in o:
+            subsizes = size_tree(value, cutoff)
+            total_size += subsizes[0]
+            if subsizes[0] > cutoff:
+                formatted = float('{:.2f}'.format(subsizes[0]))
+                if len(subsizes) == 1:
+                    val = formatted
+                else:
+                    val = (formatted, subsizes[1])
+                sizes.append(val)
+        return return_val(total_size, sizes)
+
+    else:
+        return size,
+
+def _are_instances_of(a, b, a_type):
+    """
+    Return True if `a` and `b` are both instances of the given type (or tuple
+    of types).
+    """
+    return isinstance(a, a_type) and isinstance(b, a_type)
+
+def diff_trees(a, b):
+    """
+    Find the differences between two trees or leaf nodes a and b. Return a
+    falsey value if the inputs match OR a truthy value that explains or
+    summarizes their differences, where each point in the tree where the inputs
+    differ will be a tuple (a's value, b's value, optional description).
+
+    Floating point numbers are compared with the tolerance set by the constant
+    NULP (Number of Units in the Last Place), allowing for NaN and infinite
+    values. (Adjust the tolerance level NULP if needed.)
+
+    This operation is symmetrical.
+    """
+
+    # treat str and Python 2 unicode as the same leaf type
+    # ditto for int and Python 2 long
+    if (_are_instances_of(a, b, str)
+            or _are_instances_of(a, b, int)):
+        if a != b:
+            return elide(a), elide(b)
+
+    # if they aren't they same type, they are clearly different. Also this lets us
+    # safely assume throughout the rest of the function that a and b are the same type
+    elif type(a) != type(b):
+        return elide(a, max_len=400), elide(b, max_len=400)
+
+    # if they are floats, handle various kinds of values
+    elif isinstance(a, float):
+        return compare_floats(a, b)
+
+    # if they are numpy arrays, compare them using a numpy testing function
+    elif isinstance(a, np.ndarray):
+        return compare_ndarrays(a, b)
+
+    # if they are Unums compare their contents with matching units
+    elif isinstance(a, unum.Unum):
+        a0, b0 = a.matchUnits(b)
+        return diff_trees(a0.asNumber(), b0.asNumber())
+
+    # if a special object, check predefined attributes for equality
+    elif type(a) in SPECIAL_OBJECTS:
+        diff = {}
+        attrs = SPECIAL_OBJECTS[type(a)]
+        for attr in attrs:
+            subdiff = diff_trees(getattr(a, attr), getattr(b, attr))
+            if subdiff:
+                diff[attr] = subdiff
+        return diff
+
+    # if they are leafs (including strings) use python equality comparison
+    elif is_leaf(a):
+        if a != b:
+            return elide(a), elide(b)
+
+    # if they are dictionaries then diff the value under each key
+    elif isinstance(a, Mapping):
+        diff = {}
+        na = Repr('--')
+        nb = Repr('--')
+        for key in set(a.keys()) | set(b.keys()):
+            subdiff = diff_trees(a.get(key, na), b.get(key, nb))
+            if subdiff:
+                diff[key] = subdiff
+        return diff
+
+    # if they are sequences then compare each element at each index
+    elif isinstance(a, Sequence):
+        if len(a) > len(b):
+            b = list(b) + (len(a) - len(b)) * [Repr('--')]
+        elif len(b) > len(a):
+            a = list(a) + (len(b) - len(a)) * [Repr('--')]
+
+        diff = []
+        for index in range(len(a)):
+            subdiff = diff_trees(a[index], b[index])
+            if subdiff:
+                diff.append(subdiff)
+        return diff
+
+    # this should never happen
+    else:
+        print('value not considered by `diff_trees`: {} {}'.format(a, b))
+
+def elide(value, max_len=200):
+    '''Return a value with the same repr but elided if it'd be longer than max.'''
+    repr_ = repr(value)
+    if len(repr_) > max_len:
+        return Repr(repr_[:max_len] + '...')
+    return value
+
+def simplify_error_message(message):
+    return elide(Repr(WHITESPACE.sub(' ', message).strip()))
+
+def compare_floats(f1, f2):
+    '''Compare two floats, allowing some tolerance, NaN, and Inf values. This
+    considers all types of NaN to match.
+    Return 0.0 (which is falsey) if they match, else (f1, f2).
+    '''
+    if f1 == f2 or np.isnan(f1) and np.isnan(f2):
+        return 0.0
+    try:
+        np.testing.assert_array_almost_equal_nulp(f1, f2, nulp=NULP)
+        return 0.0
+    except AssertionError:
+        # FWIW, the string error.args[0] tells the NULP difference.
+        return f1, f2
+
+def compare_ndarrays(array1, array2):
+    '''Compare two ndarrays, checking the shape and all elements, allowing for
+    NaN values and non-numeric values. Return () if they match, else a tuple of
+    diff info or just a diff description.
+
+    TODO(jerry): Allow tolerance for float elements of structured arrays and
+      handle NaN and Inf values.
+    '''
+
+    def summarize_array(ndarray):
+        return Repr(f"array({ndarray.shape} {ndarray.dtype})")
+
+    if array1.shape != array2.shape:
+        return summarize_array(array1), summarize_array(array2)
+
+    object_dtype = np.dtype(object)
+    if issubclass(array1.dtype.type, np.floating):
+        try:
+            # This handles float tolerance but not NaN and Inf.
+            with np.errstate(invalid='ignore'):
+                np.testing.assert_array_almost_equal_nulp(array1, array2, nulp=NULP)
+            return ()
+        except AssertionError as _:
+            # return elide(array1), elide(array2), simplify_error_message(e.args[0])
+            pass  # try again, below
+    # Handle ragged arrays created with an object dtype
+    elif array1.dtype == object_dtype and array2.dtype == object_dtype:
+        try:
+            assert array1.shape == array2.shape
+            for sub1, sub2 in zip(array1, array2):
+                np.testing.assert_equal(sub1, sub2)
+            return ()
+        except AssertionError as e:
+            return simplify_error_message(e.args[0])
+
+    try:
+        # This handles non-float dtypes, also NaN and Inf, but no tolerance.
+        np.testing.assert_array_equal(array1, array2)
+        return ()
+    except AssertionError as e:
+        return simplify_error_message(e.args[0])
+
+
+def load_tree(path):
+    """Load a .cPickle file as an object_tree."""
+    with open(path, "rb") as f:
+        data = pickle.load(f, fix_imports=True, encoding='latin1')
+    return object_tree(data)
+
+
+def load_fit_tree(out_subdir):
+    '''Load the parameter calculator's (Parca's) output as an object_tree.'''
+    # For convenience, optionally add the prefix 'out/'.
+    if not os.path.isabs(out_subdir) and not os.path.isdir(out_subdir):
+        out_subdir = os.path.join('out', out_subdir)
+
+    path = os.path.join(
+        out_subdir,
+        constants.KB_DIR,
+        constants.SERIALIZED_SIM_DATA_FILENAME)
+
+    return load_tree(path)
+
+def pprint_diffs(diffs, *, width=160, print_diff_lines=True, print_count=True):
+    '''Pretty-print the diff info: optionally print the detailed diff lines,
+    optionally print the diff line count as a single figure of merit; then
+    return the line count.
+    '''
+    if diffs:
+        diff_lines = pformat(diffs, width=width)
+        if print_diff_lines:
+            print(diff_lines)
+        line_count = len(diff_lines.strip().splitlines())
+    else:
+        line_count = 0
+
+    if print_count:
+        print('==> lines of differences: {}'.format(line_count))
+    return line_count
+
+
+def diff_files(path1: str, path2: str, print_diff_lines: bool = True) -> int:
+    """Diff the pair of named pickle files. Return the diff line count."""
+    tree1 = load_tree(path1)
+    tree2 = load_tree(path2)
+    diffs = diff_trees(tree1, tree2)
+    return pprint_diffs(diffs, print_diff_lines=print_diff_lines)
+
+
+def list_pickles(directory: str) -> Dict[str, str]:
+    """Return a map of .cPickle file names to paths in the given directory
+    sorted by file modification time then by filename."""
+    entries = [(entry.stat().st_mtime, entry.name, entry.path)
+               for entry in os.scandir(directory)
+               if entry.is_file() and entry.name.endswith('.cPickle')]
+    files = {e[1]: e[2] for e in sorted(entries)}
+    return files
+
+
+def diff_dirs(dir1: str, dir2: str, print_diff_lines: bool = True) -> int:
+    """Diff the pickle files in the pair of named directories. Return the total
+    diff line count."""
+    print(f'Comparing pickle files in "{dir1}" vs. "{dir2}".')
+    pickles1 = list_pickles(dir1)
+    pickles2 = list_pickles(dir2)
+    count = 0
+
+    for name, path1 in pickles1.items():
+        print(f'\n*** {name} {"*" * (75 - len(name))}')
+        path2 = pickles2.get(name)
+        if path2:
+            count += diff_files(path1, path2, print_diff_lines)
+        else:
+            print(f'{name} is in {dir1} but not {dir2}')
+            count += 1
+
+    only_in_dir2 = pickles2.keys() - pickles1.keys()
+    if only_in_dir2:
+        print(f'\n*** Pickle files in {dir2} but not {dir1}:\n'
+              f'{sorted(only_in_dir2)}')
+        count += len(only_in_dir2)
+
+    print(f'\n====> Total differences: {count} lines for {len(pickles1)} pickle'
+          f' files in {dir1} against {len(pickles2)} pickle files in {dir2}.')
+    return count
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Compare two .cPickle files"
+                    " or all the .cPickle files in two directories (in"
+                    " modification-time order)."
+                    " Print a count and optionally a summary of the differences.")
+    parser.add_argument('-c', '--count', action='store_true',
+        help="Print just the diff line count for each file, skipping the"
+             " detailed diff lines.")
+    parser.add_argument('-f', '--final-sim-data', action='store_true',
+        help="Append /kb/simData.cPickle to the two PATH args to make it a"
+             " little easier compare the final Parca output sim_data.")
+    parser.add_argument('path', metavar='PATH', nargs=2,
+        help="The two pickle files or directories to compare.")
+
+    args = parser.parse_args()
+    path1, path2 = args.path
+
+    if args.final_sim_data:
+        path1 = os.path.join(path1, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME)
+        path2 = os.path.join(path2, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME)
+
+    if os.path.isfile(path1):
+        diff_count = diff_files(path1, path2, print_diff_lines=not args.count)
+    else:
+        diff_count = diff_dirs(path1, path2, print_diff_lines=not args.count)
+
+    sys.exit(3 if diff_count else 0)
diff --git a/runscripts/debug/diff_simouts.py b/runscripts/debug/diff_simouts.py
new file mode 100644
index 000000000..5575d9e23
--- /dev/null
+++ b/runscripts/debug/diff_simouts.py
@@ -0,0 +1,24 @@
+import argparse
+import duckdb
+
+from ecoli.library.parquet_emitter import get_dataset_sql
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Compare simulation output in two ouput directories")
+    parser.add_argument('path', metavar='PATH', nargs=2,
+        help="The two output directories to compare.")
+
+    args = parser.parse_args()
+    path1, path2 = args.path
+
+    h_1, c_1 = get_dataset_sql(path1)
+    h_2, c_2 = get_dataset_sql(path2)
+    id_cols = "experiment_id, variant, lineage_seed, generation, agent_id, time"
+    ordered_sql = f"SELECT * FROM ({{sql_query}}) ORDER BY {id_cols}"
+    for sql_1, sql_2 in [(h_1, h_2), (c_1, c_2)]:
+        data_1 = duckdb.sql(ordered_sql.format(sql_query=sql_1)).arrow()
+        data_2 = duckdb.sql(ordered_sql.format(sql_query=sql_2)).arrow()
+        assert data_1.column_names == data_2.column_names, "Different columns."
+        for i, (col_1, col_2) in enumerate(zip(data_1, data_2)):
+            assert col_1 == col_2, f"{data_1.column_names[i]} is different."
diff --git a/runscripts/jenkins/check-reproducibility.sh b/runscripts/jenkins/check-reproducibility.sh
new file mode 100644
index 000000000..a8f7b848e
--- /dev/null
+++ b/runscripts/jenkins/check-reproducibility.sh
@@ -0,0 +1,40 @@
+#! /usr/bin/env bash
+
+# Runs the parca and sims twice to compare output to ensure both
+# are reproducible from run to run. Will exit with an error code
+# if either the parca or sim runs produce different output from
+# each other.
+
+set -eu
+
+script_dir=$(dirname $(dirname $(realpath $0)))
+out_dir=$(dirname $script_dir)/out
+dir1="check-reproducibility-1"
+dir2="check-reproducibility-2"
+git_hash=$(git rev-parse HEAD)
+
+source $script_dir/jenkins/setup-environment.sh
+
+# Run parca twice and check that output is consistent from run to run
+python $script_dir/parca.py -c 4 -o $out_dir/$dir1-$git_hash --save-intermediates
+python $script_dir/parca.py -c 4 -o $out_dir/$dir2-$git_hash --save-intermediates
+$script_dir/debug/compare_pickles.py $out_dir/$dir1-$git_hash/kb $out_dir/$dir2-$git_hash/kb
+
+# Run entire simulation for each parca output
+python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \
+    --config $script_dir/jenkins/configs/reproducibility.json \
+    --experiment_id $dir1-$git_hash --daughter_outdir $out_dir/$dir1-$git_hash \
+    --sim_data_path $out_dir/$dir1-$git_hash/kb/simData.cPickle
+python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \
+    --config $script_dir/jenkins/configs/reproducibility.json \
+    --experiment_id $dir2-$git_hash --daughter_outdir $out_dir/$dir2-$git_hash \
+    --sim_data_path $out_dir/$dir2-$git_hash/kb/simData.cPickle
+
+# Run short daughters to check that everything including division is consistent
+python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \
+    --config $script_dir/jenkins/configs/reproducibility.json \
+    --experiment_id $dir1-$git_hash --agent_id "00" --total_time 10
+python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \
+    --config $script_dir/jenkins/configs/reproducibility.json \
+    --experiment_id $dir2-$git_hash --agent_id "01" --total_time 10
+$script_dir/debug/diff_simouts.py $out_dir/$dir1-$git_hash $out_dir/$dir2-$git_hash
diff --git a/runscripts/jenkins/configs/ecoli-glucose-minimal.json b/runscripts/jenkins/configs/ecoli-glucose-minimal.json
new file mode 100644
index 000000000..e6bac3c76
--- /dev/null
+++ b/runscripts/jenkins/configs/ecoli-glucose-minimal.json
@@ -0,0 +1,15 @@
+{
+    "experiment_id": "Daily build",
+    "single_daughters": true,
+    "generations": 16,
+    "fail_at_total_time": true,
+    "emitter" : {
+        "type": "parquet",
+        "config": {
+            "out_dir": "/scratch/groups/mcovert/vecoli"
+        }
+    },
+    "analysis_options": {
+        "single": {"mass_fraction_summary": {}}
+    }
+}
diff --git a/runscripts/jenkins/configs/reproducibility.json b/runscripts/jenkins/configs/reproducibility.json
new file mode 100644
index 000000000..b169dd053
--- /dev/null
+++ b/runscripts/jenkins/configs/reproducibility.json
@@ -0,0 +1,10 @@
+{
+    "generations": 1,
+    "fail_at_total_time": true,
+    "emitter" : {
+        "type": "parquet",
+        "config": {
+            "out_dir": "out"
+        }
+    }
+}
diff --git a/runscripts/jenkins/setup-environment.sh b/runscripts/jenkins/setup-environment.sh
new file mode 100644
index 000000000..2232d1d2a
--- /dev/null
+++ b/runscripts/jenkins/setup-environment.sh
@@ -0,0 +1,14 @@
+set -e
+
+export PYTHONPATH=$PWD
+module load wcEcoli/python3 java/18.0.2 nextflow
+
+export PATH="${GROUP_HOME}/pyenv/bin:${PATH}"
+eval "$(pyenv init -)"
+eval "$(pyenv virtualenv-init -)"
+
+### Edit this line to make this branch use another pyenv
+pyenv local viv-ecoli
+pyenv activate
+
+make clean compile
diff --git a/runscripts/jenkins/workflow.sh b/runscripts/jenkins/workflow.sh
new file mode 100644
index 000000000..5435f6529
--- /dev/null
+++ b/runscripts/jenkins/workflow.sh
@@ -0,0 +1,5 @@
+set -e
+
+source runscripts/jenkins/setup-environment.sh
+
+python runscripts/workflow.py --config $1

From 5e1417e3359498b95cd75955359ab1ac6e7996ec Mon Sep 17 00:00:00 2001
From: thalassemia <cheah_sean@yahoo.com>
Date: Tue, 2 Jul 2024 21:43:56 -0700
Subject: [PATCH 5/5] Jenkins reproducibility and two gen tests

---
 runscripts/debug/compare_pickles.py           | 196 +++++++++++-------
 runscripts/debug/diff_simouts.py              |  47 +++--
 runscripts/jenkins/check-reproducibility.sh   |  26 ++-
 .../jenkins/configs/reproducibility.json      |   2 +-
 .../jenkins/configs/two_generations.json      |  19 ++
 5 files changed, 189 insertions(+), 101 deletions(-)
 create mode 100644 runscripts/jenkins/configs/two_generations.json

diff --git a/runscripts/debug/compare_pickles.py b/runscripts/debug/compare_pickles.py
index 37c75504c..1c7a7fe4d 100644
--- a/runscripts/debug/compare_pickles.py
+++ b/runscripts/debug/compare_pickles.py
@@ -5,6 +5,7 @@
 Usage (PATH is a path like 'out/manual/intermediates'):
     runscripts/debug/comparePickles.py PATH1 PATH2
 """
+
 import argparse
 from collections.abc import Mapping, Sequence
 import functools
@@ -32,9 +33,9 @@
 
 # Objects with a list of attributes to compare
 SPECIAL_OBJECTS = {
-    scipy.interpolate.CubicSpline: ['x', 'c', 'axis'],
-    wholecell.utils.unit_struct_array.UnitStructArray: ['struct_array', 'units'],
-    }
+    scipy.interpolate.CubicSpline: ["x", "c", "axis"],
+    wholecell.utils.unit_struct_array.UnitStructArray: ["struct_array", "units"],
+}
 
 LEAF_TYPES = (
     unum.Unum,
@@ -44,13 +45,15 @@
     functools.partial,
     types.FunctionType,
     dense.MutableDenseMatrix,
-    wholecell.utils.unit_struct_array.UnitStructArray)
+    wholecell.utils.unit_struct_array.UnitStructArray,
+)
 
-WHITESPACE = re.compile(r'\s+')
+WHITESPACE = re.compile(r"\s+")
 
 
 class Repr(object):
-    '''A Repr has the given repr() string without quotes and != any other value.'''
+    """A Repr has the given repr() string without quotes and != any other value."""
+
     def __init__(self, repr_):
         self.repr_ = repr_
 
@@ -64,7 +67,8 @@ def has_python_vars(obj):
     ordinary fields or compact slots. If not, it's presumably a built-in type
     or extension type implemented entirely in C and Cython.
     """
-    return hasattr(obj, '__dict__') or hasattr(obj, '__slots__')
+    return hasattr(obj, "__dict__") or hasattr(obj, "__slots__")
+
 
 def all_vars(obj):
     """
@@ -73,14 +77,15 @@ def all_vars(obj):
     If the object implements the pickling method `__getstate__`, call that
     instead to get its defining state.
     """
-    if hasattr(obj, '__getstate__'):
+    if hasattr(obj, "__getstate__"):
         # noinspection PyCallingNonCallable
         return obj.__getstate__()
 
-    attrs = getattr(obj, '__dict__', {})
-    attrs.update({key: getattr(obj, key) for key in getattr(obj, '__slots__', ())})
+    attrs = getattr(obj, "__dict__", {})
+    attrs.update({key: getattr(obj, key) for key in getattr(obj, "__slots__", ())})
     return attrs
 
+
 def is_leaf(value, leaves=LEAF_TYPES):
     """
     Predicate to determine if we have reached the end of how deep we want to traverse
@@ -88,11 +93,14 @@ def is_leaf(value, leaves=LEAF_TYPES):
     """
     if isinstance(value, (Mapping, Sequence)):
         return isinstance(value, (bytes, str))
-    return (callable(value)                 # it's callable
-            or isinstance(value, leaves)    # it's an instance of a declared leaf type
-            or not has_python_vars(value))  # an object without Python instance variables
+    return (
+        callable(value)  # it's callable
+        or isinstance(value, leaves)  # it's an instance of a declared leaf type
+        or not has_python_vars(value)
+    )  # an object without Python instance variables
+
 
-def object_tree(obj, path='', debug=None):
+def object_tree(obj, path="", debug=None):
     """
     Diagnostic tool to inspect a complex data structure.
 
@@ -115,26 +123,34 @@ def object_tree(obj, path='', debug=None):
             it will only print methods and functions it finds.
     """
 
-    if debug == 'ALL':
+    if debug == "ALL":
         print(path)
 
     if is_leaf(obj):
-        if callable(obj) and (debug == 'CALLABLE'):
-            print('{}: {}'.format(path, obj))
+        if callable(obj) and (debug == "CALLABLE"):
+            print("{}: {}".format(path, obj))
         return obj
     elif isinstance(obj, Mapping):
-        return {key: object_tree(value, "{}['{}']".format(path, key), debug)
-            for (key, value) in obj.items()}
+        return {
+            key: object_tree(value, "{}['{}']".format(path, key), debug)
+            for (key, value) in obj.items()
+        }
     elif isinstance(obj, Sequence):
-        return [object_tree(subobj, "{}[{}]".format(path, index), debug) for index, subobj in enumerate(obj)]
+        return [
+            object_tree(subobj, "{}[{}]".format(path, index), debug)
+            for index, subobj in enumerate(obj)
+        ]
     else:
         attrs = all_vars(obj)
-        tree = {key: object_tree(value, "{}.{}".format(path, key), debug)
-                for (key, value) in attrs.items()}
-        tree['!type'] = type(obj)
+        tree = {
+            key: object_tree(value, "{}.{}".format(path, key), debug)
+            for (key, value) in attrs.items()
+        }
+        tree["!type"] = type(obj)
 
         return tree
 
+
 def size_tree(o, cutoff=0.1):
     """
     Find the size of attributes in an object tree. Sizes greater than the cutoff
@@ -149,7 +165,7 @@ def return_val(total, value):
         if total > cutoff and value:
             return total, value
         else:
-            return total,
+            return (total,)
 
     def get_size(o):
         return sys.getsizeof(o) / 2**20  # convert to MB
@@ -160,12 +176,12 @@ def get_size(o):
     if isinstance(o, unum.Unum):
         size += size_tree(o._unit)[0]
         size += get_size(o._value)
-        return size,
+        return (size,)
 
     # special handling of leaf to get size of str sequence
     elif isinstance(o, Bio.Seq.Seq):
         size += get_size(o._data)
-        return size,
+        return (size,)
 
     # special handling of leaf, each entry is allocated the same amount of space
     elif isinstance(o, wholecell.utils.unit_struct_array.UnitStructArray):
@@ -173,7 +189,7 @@ def get_size(o):
         n_entries = len(o.struct_array)
         if n_entries:
             size += get_size(o.struct_array[0]) * n_entries
-        return size,
+        return (size,)
 
     # if a special object, check predefined attributes for equality
     elif type(o) in SPECIAL_OBJECTS:
@@ -183,7 +199,7 @@ def get_size(o):
             subsizes = size_tree(getattr(o, attr), cutoff)
             size += subsizes[0]
             if subsizes[0] > cutoff:
-                formatted = float('{:.2f}'.format(subsizes[0]))
+                formatted = float("{:.2f}".format(subsizes[0]))
                 if len(subsizes) == 1:
                     val = formatted
                 else:
@@ -194,7 +210,7 @@ def get_size(o):
     # if it is a leaf, just return the size
     # TODO: any special handling for types that are not already accounted for above
     elif is_leaf(o):
-        return size,
+        return (size,)
 
     # if it is a dictionary, then get the size of keys and values
     elif isinstance(o, Mapping):
@@ -205,7 +221,7 @@ def get_size(o):
             entry_size = subsizes[0] + get_size(key)
             total_size += entry_size
             if entry_size > cutoff:
-                formatted = float('{:.2f}'.format(entry_size))
+                formatted = float("{:.2f}".format(entry_size))
                 if len(subsizes) == 1:
                     val = formatted
                 else:
@@ -221,7 +237,7 @@ def get_size(o):
             subsizes = size_tree(value, cutoff)
             total_size += subsizes[0]
             if subsizes[0] > cutoff:
-                formatted = float('{:.2f}'.format(subsizes[0]))
+                formatted = float("{:.2f}".format(subsizes[0]))
                 if len(subsizes) == 1:
                     val = formatted
                 else:
@@ -230,7 +246,8 @@ def get_size(o):
         return return_val(total_size, sizes)
 
     else:
-        return size,
+        return (size,)
+
 
 def _are_instances_of(a, b, a_type):
     """
@@ -239,6 +256,7 @@ def _are_instances_of(a, b, a_type):
     """
     return isinstance(a, a_type) and isinstance(b, a_type)
 
+
 def diff_trees(a, b):
     """
     Find the differences between two trees or leaf nodes a and b. Return a
@@ -255,8 +273,7 @@ def diff_trees(a, b):
 
     # treat str and Python 2 unicode as the same leaf type
     # ditto for int and Python 2 long
-    if (_are_instances_of(a, b, str)
-            or _are_instances_of(a, b, int)):
+    if _are_instances_of(a, b, str) or _are_instances_of(a, b, int):
         if a != b:
             return elide(a), elide(b)
 
@@ -296,8 +313,8 @@ def diff_trees(a, b):
     # if they are dictionaries then diff the value under each key
     elif isinstance(a, Mapping):
         diff = {}
-        na = Repr('--')
-        nb = Repr('--')
+        na = Repr("--")
+        nb = Repr("--")
         for key in set(a.keys()) | set(b.keys()):
             subdiff = diff_trees(a.get(key, na), b.get(key, nb))
             if subdiff:
@@ -307,9 +324,9 @@ def diff_trees(a, b):
     # if they are sequences then compare each element at each index
     elif isinstance(a, Sequence):
         if len(a) > len(b):
-            b = list(b) + (len(a) - len(b)) * [Repr('--')]
+            b = list(b) + (len(a) - len(b)) * [Repr("--")]
         elif len(b) > len(a):
-            a = list(a) + (len(b) - len(a)) * [Repr('--')]
+            a = list(a) + (len(b) - len(a)) * [Repr("--")]
 
         diff = []
         for index in range(len(a)):
@@ -320,23 +337,26 @@ def diff_trees(a, b):
 
     # this should never happen
     else:
-        print('value not considered by `diff_trees`: {} {}'.format(a, b))
+        print("value not considered by `diff_trees`: {} {}".format(a, b))
+
 
 def elide(value, max_len=200):
-    '''Return a value with the same repr but elided if it'd be longer than max.'''
+    """Return a value with the same repr but elided if it'd be longer than max."""
     repr_ = repr(value)
     if len(repr_) > max_len:
-        return Repr(repr_[:max_len] + '...')
+        return Repr(repr_[:max_len] + "...")
     return value
 
+
 def simplify_error_message(message):
-    return elide(Repr(WHITESPACE.sub(' ', message).strip()))
+    return elide(Repr(WHITESPACE.sub(" ", message).strip()))
+
 
 def compare_floats(f1, f2):
-    '''Compare two floats, allowing some tolerance, NaN, and Inf values. This
+    """Compare two floats, allowing some tolerance, NaN, and Inf values. This
     considers all types of NaN to match.
     Return 0.0 (which is falsey) if they match, else (f1, f2).
-    '''
+    """
     if f1 == f2 or np.isnan(f1) and np.isnan(f2):
         return 0.0
     try:
@@ -346,14 +366,15 @@ def compare_floats(f1, f2):
         # FWIW, the string error.args[0] tells the NULP difference.
         return f1, f2
 
+
 def compare_ndarrays(array1, array2):
-    '''Compare two ndarrays, checking the shape and all elements, allowing for
+    """Compare two ndarrays, checking the shape and all elements, allowing for
     NaN values and non-numeric values. Return () if they match, else a tuple of
     diff info or just a diff description.
 
     TODO(jerry): Allow tolerance for float elements of structured arrays and
       handle NaN and Inf values.
-    '''
+    """
 
     def summarize_array(ndarray):
         return Repr(f"array({ndarray.shape} {ndarray.dtype})")
@@ -365,7 +386,7 @@ def summarize_array(ndarray):
     if issubclass(array1.dtype.type, np.floating):
         try:
             # This handles float tolerance but not NaN and Inf.
-            with np.errstate(invalid='ignore'):
+            with np.errstate(invalid="ignore"):
                 np.testing.assert_array_almost_equal_nulp(array1, array2, nulp=NULP)
             return ()
         except AssertionError as _:
@@ -392,28 +413,28 @@ def summarize_array(ndarray):
 def load_tree(path):
     """Load a .cPickle file as an object_tree."""
     with open(path, "rb") as f:
-        data = pickle.load(f, fix_imports=True, encoding='latin1')
+        data = pickle.load(f, fix_imports=True, encoding="latin1")
     return object_tree(data)
 
 
 def load_fit_tree(out_subdir):
-    '''Load the parameter calculator's (Parca's) output as an object_tree.'''
+    """Load the parameter calculator's (Parca's) output as an object_tree."""
     # For convenience, optionally add the prefix 'out/'.
     if not os.path.isabs(out_subdir) and not os.path.isdir(out_subdir):
-        out_subdir = os.path.join('out', out_subdir)
+        out_subdir = os.path.join("out", out_subdir)
 
     path = os.path.join(
-        out_subdir,
-        constants.KB_DIR,
-        constants.SERIALIZED_SIM_DATA_FILENAME)
+        out_subdir, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME
+    )
 
     return load_tree(path)
 
+
 def pprint_diffs(diffs, *, width=160, print_diff_lines=True, print_count=True):
-    '''Pretty-print the diff info: optionally print the detailed diff lines,
+    """Pretty-print the diff info: optionally print the detailed diff lines,
     optionally print the diff line count as a single figure of merit; then
     return the line count.
-    '''
+    """
     if diffs:
         diff_lines = pformat(diffs, width=width)
         if print_diff_lines:
@@ -423,7 +444,7 @@ def pprint_diffs(diffs, *, width=160, print_diff_lines=True, print_count=True):
         line_count = 0
 
     if print_count:
-        print('==> lines of differences: {}'.format(line_count))
+        print("==> lines of differences: {}".format(line_count))
     return line_count
 
 
@@ -438,9 +459,11 @@ def diff_files(path1: str, path2: str, print_diff_lines: bool = True) -> int:
 def list_pickles(directory: str) -> Dict[str, str]:
     """Return a map of .cPickle file names to paths in the given directory
     sorted by file modification time then by filename."""
-    entries = [(entry.stat().st_mtime, entry.name, entry.path)
-               for entry in os.scandir(directory)
-               if entry.is_file() and entry.name.endswith('.cPickle')]
+    entries = [
+        (entry.stat().st_mtime, entry.name, entry.path)
+        for entry in os.scandir(directory)
+        if entry.is_file() and entry.name.endswith(".cPickle")
+    ]
     files = {e[1]: e[2] for e in sorted(entries)}
     return files
 
@@ -459,40 +482,61 @@ def diff_dirs(dir1: str, dir2: str, print_diff_lines: bool = True) -> int:
         if path2:
             count += diff_files(path1, path2, print_diff_lines)
         else:
-            print(f'{name} is in {dir1} but not {dir2}')
+            print(f"{name} is in {dir1} but not {dir2}")
             count += 1
 
     only_in_dir2 = pickles2.keys() - pickles1.keys()
     if only_in_dir2:
-        print(f'\n*** Pickle files in {dir2} but not {dir1}:\n'
-              f'{sorted(only_in_dir2)}')
+        print(
+            f"\n*** Pickle files in {dir2} but not {dir1}:\n" f"{sorted(only_in_dir2)}"
+        )
         count += len(only_in_dir2)
 
-    print(f'\n====> Total differences: {count} lines for {len(pickles1)} pickle'
-          f' files in {dir1} against {len(pickles2)} pickle files in {dir2}.')
+    print(
+        f"\n====> Total differences: {count} lines for {len(pickles1)} pickle"
+        f" files in {dir1} against {len(pickles2)} pickle files in {dir2}."
+    )
     return count
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Compare two .cPickle files"
-                    " or all the .cPickle files in two directories (in"
-                    " modification-time order)."
-                    " Print a count and optionally a summary of the differences.")
-    parser.add_argument('-c', '--count', action='store_true',
+        " or all the .cPickle files in two directories (in"
+        " modification-time order)."
+        " Print a count and optionally a summary of the differences."
+    )
+    parser.add_argument(
+        "-c",
+        "--count",
+        action="store_true",
         help="Print just the diff line count for each file, skipping the"
-             " detailed diff lines.")
-    parser.add_argument('-f', '--final-sim-data', action='store_true',
+        " detailed diff lines.",
+    )
+    parser.add_argument(
+        "-f",
+        "--final-sim-data",
+        action="store_true",
         help="Append /kb/simData.cPickle to the two PATH args to make it a"
-             " little easier compare the final Parca output sim_data.")
-    parser.add_argument('path', metavar='PATH', nargs=2,
-        help="The two pickle files or directories to compare.")
+        " little easier compare the final Parca output sim_data.",
+    )
+    parser.add_argument(
+        "path",
+        metavar="PATH",
+        nargs=2,
+        help="The two pickle files or directories to compare.",
+    )
 
     args = parser.parse_args()
     path1, path2 = args.path
 
     if args.final_sim_data:
-        path1 = os.path.join(path1, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME)
-        path2 = os.path.join(path2, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME)
+        path1 = os.path.join(
+            path1, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME
+        )
+        path2 = os.path.join(
+            path2, constants.KB_DIR, constants.SERIALIZED_SIM_DATA_FILENAME
+        )
 
     if os.path.isfile(path1):
         diff_count = diff_files(path1, path2, print_diff_lines=not args.count)
diff --git a/runscripts/debug/diff_simouts.py b/runscripts/debug/diff_simouts.py
index 5575d9e23..5ad4b74cf 100644
--- a/runscripts/debug/diff_simouts.py
+++ b/runscripts/debug/diff_simouts.py
@@ -1,24 +1,43 @@
 import argparse
 import duckdb
+import numpy as np
 
-from ecoli.library.parquet_emitter import get_dataset_sql
+from ecoli.library.parquet_emitter import get_dataset_sql, ndlist_to_ndarray
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Compare simulation output in two ouput directories")
-    parser.add_argument('path', metavar='PATH', nargs=2,
-        help="The two output directories to compare.")
+        description="Compare simulation output of two experiment IDs."
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Output directory containing data for two experiment IDs.",
+    )
+    parser.add_argument(
+        "exp_ids", metavar="EXP_ID", nargs=2, help="The two experiment IDs to compare."
+    )
 
     args = parser.parse_args()
-    path1, path2 = args.path
+    exp_id_1, exp_id_2 = args.exp_ids
 
-    h_1, c_1 = get_dataset_sql(path1)
-    h_2, c_2 = get_dataset_sql(path2)
+    history_sql, _ = get_dataset_sql(args.output)
     id_cols = "experiment_id, variant, lineage_seed, generation, agent_id, time"
-    ordered_sql = f"SELECT * FROM ({{sql_query}}) ORDER BY {id_cols}"
-    for sql_1, sql_2 in [(h_1, h_2), (c_1, c_2)]:
-        data_1 = duckdb.sql(ordered_sql.format(sql_query=sql_1)).arrow()
-        data_2 = duckdb.sql(ordered_sql.format(sql_query=sql_2)).arrow()
-        assert data_1.column_names == data_2.column_names, "Different columns."
-        for i, (col_1, col_2) in enumerate(zip(data_1, data_2)):
-            assert col_1 == col_2, f"{data_1.column_names[i]} is different."
+    ordered_sql = f"SELECT * FROM ({{sql_query}}) WHERE experiment_id = '{{exp_id}}' ORDER BY {id_cols}"
+    data_1 = duckdb.sql(
+        ordered_sql.format(sql_query=history_sql, exp_id=exp_id_1)
+    ).arrow()
+    data_2 = duckdb.sql(
+        ordered_sql.format(sql_query=history_sql, exp_id=exp_id_2)
+    ).arrow()
+    assert data_1.column_names == data_2.column_names, "Different columns."
+    for i, (col_1, col_2) in enumerate(zip(data_1, data_2)):
+        if col_1 != col_2 and data_1.column_names[i] not in [
+            "experiment_id",
+            "agent_id",
+        ]:
+            np.testing.assert_allclose(
+                ndlist_to_ndarray(col_1),
+                ndlist_to_ndarray(col_2),
+                atol=1e-12,
+                err_msg=f"{data_1.column_names[i]} not equal",
+            )
diff --git a/runscripts/jenkins/check-reproducibility.sh b/runscripts/jenkins/check-reproducibility.sh
index a8f7b848e..1115a2f96 100644
--- a/runscripts/jenkins/check-reproducibility.sh
+++ b/runscripts/jenkins/check-reproducibility.sh
@@ -9,6 +9,7 @@ set -eu
 
 script_dir=$(dirname $(dirname $(realpath $0)))
 out_dir=$(dirname $script_dir)/out
+data_dir=$(dirname $script_dir)/data
 dir1="check-reproducibility-1"
 dir2="check-reproducibility-2"
 git_hash=$(git rev-parse HEAD)
@@ -16,25 +17,30 @@ git_hash=$(git rev-parse HEAD)
 source $script_dir/jenkins/setup-environment.sh
 
 # Run parca twice and check that output is consistent from run to run
-python $script_dir/parca.py -c 4 -o $out_dir/$dir1-$git_hash --save-intermediates
-python $script_dir/parca.py -c 4 -o $out_dir/$dir2-$git_hash --save-intermediates
-$script_dir/debug/compare_pickles.py $out_dir/$dir1-$git_hash/kb $out_dir/$dir2-$git_hash/kb
+python $script_dir/parca.py -c 4 -o "$out_dir/$dir1-$git_hash"
+python $script_dir/parca.py -c 4 -o "$out_dir/$dir2-$git_hash"
+python $script_dir/debug/compare_pickles.py "$out_dir/$dir1-$git_hash/kb" "$out_dir/$dir2-$git_hash/kb"
 
 # Run entire simulation for each parca output
 python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \
     --config $script_dir/jenkins/configs/reproducibility.json \
-    --experiment_id $dir1-$git_hash --daughter_outdir $out_dir/$dir1-$git_hash \
-    --sim_data_path $out_dir/$dir1-$git_hash/kb/simData.cPickle
+    --experiment_id "$dir1-$git_hash" --daughter_outdir "$out_dir/$dir1-$git_hash" \
+    --sim_data_path "$out_dir/$dir1-$git_hash/kb/simData.cPickle" --fail_at_total_time
 python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \
     --config $script_dir/jenkins/configs/reproducibility.json \
-    --experiment_id $dir2-$git_hash --daughter_outdir $out_dir/$dir2-$git_hash \
-    --sim_data_path $out_dir/$dir2-$git_hash/kb/simData.cPickle
+    --experiment_id "$dir2-$git_hash" --daughter_outdir "$out_dir/$dir2-$git_hash" \
+    --sim_data_path "$out_dir/$dir2-$git_hash/kb/simData.cPickle" --fail_at_total_time
 
 # Run short daughters to check that everything including division is consistent
+ln -s "$out_dir/$dir1-$git_hash/daughter_state_0.json" $data_dir/daughter_state_0.json
 python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \
     --config $script_dir/jenkins/configs/reproducibility.json \
-    --experiment_id $dir1-$git_hash --agent_id "00" --total_time 10
+    --experiment_id "$dir1-$git_hash" --agent_id "00" --total_time 10 \
+    --initial_state_file daughter_state_0
+rm $data_dir/daughter_state_0.json
+ln -s "$out_dir/$dir2-$git_hash/daughter_state_0.json" $data_dir/daughter_state_0.json
 python $(dirname $script_dir)/ecoli/experiments/ecoli_master_sim.py \
     --config $script_dir/jenkins/configs/reproducibility.json \
-    --experiment_id $dir2-$git_hash --agent_id "01" --total_time 10
-$script_dir/debug/diff_simouts.py $out_dir/$dir1-$git_hash $out_dir/$dir2-$git_hash
+    --experiment_id "$dir2-$git_hash" --agent_id "00" --total_time 10 \
+    --initial_state_file daughter_state_0
+python $script_dir/debug/diff_simouts.py -o "$out_dir" "$dir1-$git_hash" "$dir2-$git_hash"
diff --git a/runscripts/jenkins/configs/reproducibility.json b/runscripts/jenkins/configs/reproducibility.json
index b169dd053..ccf6dfe4b 100644
--- a/runscripts/jenkins/configs/reproducibility.json
+++ b/runscripts/jenkins/configs/reproducibility.json
@@ -1,6 +1,6 @@
 {
+    "suffix_time": false,
     "generations": 1,
-    "fail_at_total_time": true,
     "emitter" : {
         "type": "parquet",
         "config": {
diff --git a/runscripts/jenkins/configs/two_generations.json b/runscripts/jenkins/configs/two_generations.json
new file mode 100644
index 000000000..7563f93cf
--- /dev/null
+++ b/runscripts/jenkins/configs/two_generations.json
@@ -0,0 +1,19 @@
+{
+    "experiment_id": "two_generations",
+    "suffix_time": false,
+    "parca_options": {
+        "cpus": 3
+    },
+    "generations": 2,
+    "n_init_sims": 1,
+    "single_daughters": true,
+    "emitter" : {
+        "type": "parquet",
+        "config": {
+            "out_dir": "out"
+        }
+    },
+    "analysis_options": {
+        "single": {"mass_fraction_summary": {}}
+    }
+}