From 632cde71419259d48c9b725426e8a49b89c0d59e Mon Sep 17 00:00:00 2001
From: Jerry Morrison <1fish2@users.noreply.github.com>
Date: Thu, 22 Jul 2021 12:35:07 -0700
Subject: [PATCH] independent km caches for distinct cases

Put a checksum into the `KmcountsCached` cache filename so different cases get independent cache files, e.g. when switching git branches, changing Parca options during parameter optimization, or switching between mono- and polycistronic operons.

This renames the cache file from `fixtures/endo_km/km3.cPickle` to `cache/parca-km-1918837868.cPickle`, for instance.

Q. Does anyone prefer the "fixtures" directory name?

The cache files `cache/parca-km-*.cPickle` will accumulate until `make clean`.

Does this succeed in distinguishing current cases?

We could make this more sensitive by checksumming more inputs, or less picky by rounding before checksumming, e.g. checksumming `Kmcounts.astype(np.float16)`.

See #1123
---
 Makefile                               |  2 +-
 reconstruction/ecoli/fit_sim_data_1.py | 22 +++++++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index c8a3d3b441..6dbf3e86a2 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ compile:
 #  write_ode_file.py in Parca code.
 # Fireworks writes launcher_20* and block_20*.
 clean:
-	rm -fr fixtures
+	rm -fr fixtures cache
 	(cd reconstruction/ecoli/dataclasses/process && rm -f equilibrium_odes.py two_component_system_odes*.py)
 	find . -name "*.pyc" -exec rm -rf {} \;
 	find . -name "*.o" -exec rm -fr {} \;
diff --git a/reconstruction/ecoli/fit_sim_data_1.py b/reconstruction/ecoli/fit_sim_data_1.py
index d105bb9530..663f7aea00 100644
--- a/reconstruction/ecoli/fit_sim_data_1.py
+++ b/reconstruction/ecoli/fit_sim_data_1.py
@@ -5,8 +5,7 @@
 TODO: functionalize so that values are not both set and returned from some methods
 """
 
-from __future__ import absolute_import, division, print_function
-
+import binascii
 import functools
 import itertools
 import os
@@ -26,7 +25,6 @@
 from wholecell.containers.bulk_objects_container import BulkObjectsContainer
 from wholecell.utils import filepath, parallelization, units
 from wholecell.utils.fitting import normalize, masses_and_counts_for_homeostatic_target
-from wholecell.utils import parallelization
 
 
 # Fitting parameters
@@ -3176,6 +3174,13 @@ def calculateRnapRecruitment(sim_data, cell_specs):
 		}
 
 
+def crc32(arr: np.ndarray) -> int:
+	"""Return a CRC32 checksum of an ndarray."""
+	shape = str(arr.shape).encode()  # the shape is checksummed too, ahead of the data bytes
+	values = arr.tobytes()  # raw bytes of the array's elements
+	return binascii.crc32(shape + values)  # one checksum over shape text + data bytes
+
+
 def setKmCooperativeEndoRNonLinearRNAdecay(sim_data, bulkContainer):
 	"""
 	Fits the affinities (Michaelis-Menten constants) for RNAs binding to endoRNAses.
@@ -3324,13 +3329,12 @@ def setKmCooperativeEndoRNonLinearRNAdecay(sim_data, bulkContainer):
 				alpha
 			)
 
+	# The checksum in the filename picks independent caches for distinct cases
+	# such as different Parca options or Parca code in different git branches.
+	# `make clean` will delete the cache files.
 	needToUpdate = False
-	fixturesDir = filepath.makedirs(filepath.ROOT_PATH, "fixtures", "endo_km")
-	# Numpy 'U' fields make these files incompatible with older code, so change
-	# the filename. No need to make files compatible between Python 2 & 3; we'd
-	# have to set the same protocol version and set Python 3-only args like
-	# encoding='latin1'.
-	km_filepath = os.path.join(fixturesDir, 'km{}.cPickle'.format(sys.version_info[0]))
+	cache_dir = filepath.makedirs(filepath.ROOT_PATH, "cache")
+	km_filepath = os.path.join(cache_dir, f'parca-km-{crc32(Kmcounts)}.cPickle')
 
 	if os.path.exists(km_filepath):
 		with open(km_filepath, "rb") as f: