From aaf70a58763dc9f214839eb9ac43933e975d8be1 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Thu, 22 Aug 2024 12:46:47 +0000
Subject: [PATCH 01/13] remove a config import that is unused

this test checks that memoization works with all configs

test design: that's not necessarily true in future, but it should be now.
In future, perhaps a memoizer option would be "NoMemoizer", which would
not do *any* memoization, not even in-memory-only? But I think it's OK
not to do that for now, and I think it's OK to require that a memoizer
always does actually do memoization at a memory level (so you can't
avoid it...)
---
 parsl/tests/test_python_apps/test_memoize_1.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/parsl/tests/test_python_apps/test_memoize_1.py b/parsl/tests/test_python_apps/test_memoize_1.py
index e0d03a6ebf..0d5f85f490 100644
--- a/parsl/tests/test_python_apps/test_memoize_1.py
+++ b/parsl/tests/test_python_apps/test_memoize_1.py
@@ -2,7 +2,6 @@
 
 import parsl
 from parsl.app.app import python_app
-from parsl.tests.configs.local_threads import config
 
 
 @python_app(cache=True)

From 84a66b3542a007989fa5d6874ce9e17ed37ae1cd Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Thu, 22 Aug 2024 12:19:13 +0000
Subject: [PATCH 02/13] delete duplicate/subtest

this test only checks that the checkpoint dir exists

test_python_checkpoint_1 already checks that files exist within the checkpoint dir
---
 .../test_python_checkpoint_3.py               | 42 -------------------
 1 file changed, 42 deletions(-)
 delete mode 100644 parsl/tests/test_checkpointing/test_python_checkpoint_3.py

diff --git a/parsl/tests/test_checkpointing/test_python_checkpoint_3.py b/parsl/tests/test_checkpointing/test_python_checkpoint_3.py
deleted file mode 100644
index 1a02dd7a74..0000000000
--- a/parsl/tests/test_checkpointing/test_python_checkpoint_3.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os
-
-import pytest
-
-import parsl
-from parsl.app.app import python_app
-from parsl.tests.configs.local_threads import config
-
-
-def local_setup():
-    global dfk
-    dfk = parsl.load(config)
-
-
-def local_teardown():
-    parsl.dfk().cleanup()
-
-
-@python_app
-def slow_double(x, sleep_dur=1, cache=True):
-    import time
-    time.sleep(sleep_dur)
-    return x * 2
-
-
-@pytest.mark.local
-def test_checkpointing():
-    """Testing code snippet from documentation
-    """
-
-    N = 5  # Number of calls to slow_double
-    d = []  # List to store the futures
-    for i in range(0, N):
-        d.append(slow_double(i))
-
-    # Wait for the results
-    [i.result() for i in d]
-
-    checkpoint_dir = dfk.checkpoint()
-    print(checkpoint_dir)
-
-    assert os.path.exists(checkpoint_dir), "Checkpoint dir does not exist"

From fa4b7a59308c37e991b041932f619cd25de210d2 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Thu, 22 Aug 2024 10:07:30 +0000
Subject: [PATCH 03/13] remove unused checkpoint return value

---
 docs/userguide/checkpoints.rst                        |  3 +--
 parsl/dataflow/dflow.py                               |  4 +---
 .../test_checkpointing/test_python_checkpoint_1.py    | 11 +++++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/userguide/checkpoints.rst b/docs/userguide/checkpoints.rst
index 8867107b7a..3607cdee9f 100644
--- a/docs/userguide/checkpoints.rst
+++ b/docs/userguide/checkpoints.rst
@@ -264,8 +264,7 @@ of the ``slow_double`` app.
     # Wait for the results
     [i.result() for i in d]
 
-    cpt_dir = dfk.checkpoint()
-    print(cpt_dir)  # Prints the checkpoint dir
+    dfk.checkpoint()
 
 
 Resuming from a checkpoint
diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index 344173c4b1..0595e4c963 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -1323,7 +1323,7 @@ def cleanup(self) -> None:
 
         logger.info("DFK cleanup complete")
 
-    def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
+    def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> None:
         """Checkpoint the dfk incrementally to a checkpoint file.
 
         When called, every task that has been completed yet not
@@ -1391,8 +1391,6 @@ def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
             else:
                 logger.info("Done checkpointing {} tasks".format(count))
 
-            return checkpoint_dir
-
     def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
         """Load a checkpoint file into a lookup table.
 
diff --git a/parsl/tests/test_checkpointing/test_python_checkpoint_1.py b/parsl/tests/test_checkpointing/test_python_checkpoint_1.py
index 7539af7f24..95a31c943a 100644
--- a/parsl/tests/test_checkpointing/test_python_checkpoint_1.py
+++ b/parsl/tests/test_checkpointing/test_python_checkpoint_1.py
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 
 import pytest
 
@@ -20,15 +21,17 @@ def uuid_app():
 
 
 @pytest.mark.local
-def test_initial_checkpoint_write():
+def test_initial_checkpoint_write() -> None:
     """1. Launch a few apps and write the checkpoint once a few have completed
     """
     uuid_app().result()
 
-    cpt_dir = parsl.dfk().checkpoint()
+    parsl.dfk().checkpoint()
 
-    cptpath = cpt_dir + '/dfk.pkl'
+    cpt_dir = Path(parsl.dfk().run_dir) / 'checkpoint'
+
+    cptpath = cpt_dir / 'dfk.pkl'
     assert os.path.exists(cptpath), f"DFK checkpoint missing: {cptpath}"
 
-    cptpath = cpt_dir + '/tasks.pkl'
+    cptpath = cpt_dir / 'tasks.pkl'
     assert os.path.exists(cptpath), f"Tasks checkpoint missing: {cptpath}"

From b9b2a6fc485863133774138f36e13a604a364db4 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Fri, 19 Jul 2024 16:46:00 +0000
Subject: [PATCH 04/13] make checkpoint call not use dfk state, in prep for
 moving into memoizer

---
 parsl/dataflow/dflow.py                       | 24 ++++++++++++-------
 .../test_python_checkpoint_1.py               |  2 +-
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index 0595e4c963..0d9bf7f850 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -176,6 +176,8 @@ def __init__(self, config: Config) -> None:
         self.checkpointed_tasks = 0
         self._checkpoint_timer = None
         self.checkpoint_mode = config.checkpoint_mode
+
+        self._modify_checkpointable_tasks_lock = threading.Lock()
         self.checkpointable_tasks: List[TaskRecord] = []
 
         # this must be set before executors are added since add_executors calls
@@ -200,7 +202,7 @@ def __init__(self, config: Config) -> None:
                 except Exception:
                     raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
                 checkpoint_period = (h * 3600) + (m * 60) + s
-                self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
+                self._checkpoint_timer = Timer(self.invoke_checkpoint, interval=checkpoint_period, name="Checkpoint")
 
         self.task_count = 0
         self.tasks: Dict[int, TaskRecord] = {}
@@ -571,7 +573,7 @@ def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
         if self.checkpoint_mode == 'task_exit':
             self.checkpoint(tasks=[task_record])
         elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
-            with self.checkpoint_lock:
+            with self._modify_checkpointable_tasks_lock:
                 self.checkpointable_tasks.append(task_record)
         elif self.checkpoint_mode is None:
             pass
@@ -1250,7 +1252,10 @@ def cleanup(self) -> None:
         # Checkpointing takes priority over the rest of the tasks
         # checkpoint if any valid checkpoint method is specified
         if self.checkpoint_mode is not None:
-            self.checkpoint()
+
+            # TODO: accesses to self.checkpointable_tasks should happen
+            # under a lock?
+            self.checkpoint(self.checkpointable_tasks)
 
             if self._checkpoint_timer:
                 logger.info("Stopping checkpoint timer")
@@ -1323,7 +1328,12 @@ def cleanup(self) -> None:
 
         logger.info("DFK cleanup complete")
 
-    def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> None:
+    def invoke_checkpoint(self) -> None:
+        with self._modify_checkpointable_tasks_lock:
+            self.checkpoint(self.checkpointable_tasks)
+            self.checkpointable_tasks = []
+
+    def checkpoint(self, tasks: Sequence[TaskRecord]) -> None:
         """Checkpoint the dfk incrementally to a checkpoint file.
 
         When called, every task that has been completed yet not
@@ -1342,11 +1352,7 @@ def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> None:
             run under RUNDIR/checkpoints/{tasks.pkl, dfk.pkl}
         """
         with self.checkpoint_lock:
-            if tasks:
-                checkpoint_queue = tasks
-            else:
-                checkpoint_queue = self.checkpointable_tasks
-                self.checkpointable_tasks = []
+            checkpoint_queue = tasks
 
             checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
             checkpoint_dfk = checkpoint_dir + '/dfk.pkl'
diff --git a/parsl/tests/test_checkpointing/test_python_checkpoint_1.py b/parsl/tests/test_checkpointing/test_python_checkpoint_1.py
index 95a31c943a..f70f37f9c7 100644
--- a/parsl/tests/test_checkpointing/test_python_checkpoint_1.py
+++ b/parsl/tests/test_checkpointing/test_python_checkpoint_1.py
@@ -26,7 +26,7 @@ def test_initial_checkpoint_write() -> None:
     """
     uuid_app().result()
 
-    parsl.dfk().checkpoint()
+    parsl.dfk().invoke_checkpoint()
 
     cpt_dir = Path(parsl.dfk().run_dir) / 'checkpoint'
 

From 4594fc9b06547c9a716476492e0c480b9bad8fd6 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Fri, 19 Jul 2024 16:58:39 +0000
Subject: [PATCH 05/13] move load_checkpoints into memoizer

---
 parsl/dataflow/dflow.py       | 78 +++-------------------------------
 parsl/dataflow/memoization.py | 80 ++++++++++++++++++++++++++++++++++-
 2 files changed, 83 insertions(+), 75 deletions(-)

diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index 0d9bf7f850..70f8be29c2 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -30,7 +30,7 @@
 from parsl.data_provider.data_manager import DataManager
 from parsl.data_provider.files import File
 from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
-from parsl.dataflow.errors import BadCheckpoint, DependencyError, JoinError
+from parsl.dataflow.errors import DependencyError, JoinError
 from parsl.dataflow.futures import AppFuture
 from parsl.dataflow.memoization import Memoizer
 from parsl.dataflow.rundirs import make_rundir
@@ -166,13 +166,13 @@ def __init__(self, config: Config) -> None:
                                  workflow_info)
 
         if config.checkpoint_files is not None:
-            checkpoints = self.load_checkpoints(config.checkpoint_files)
+            checkpoint_files = config.checkpoint_files
         elif config.checkpoint_files is None and config.checkpoint_mode is not None:
-            checkpoints = self.load_checkpoints(get_all_checkpoints(self.run_dir))
+            checkpoint_files = get_all_checkpoints(self.run_dir)
         else:
-            checkpoints = {}
+            checkpoint_files = []
 
-        self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint=checkpoints)
+        self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
         self.checkpointed_tasks = 0
         self._checkpoint_timer = None
         self.checkpoint_mode = config.checkpoint_mode
@@ -1397,74 +1397,6 @@ def checkpoint(self, tasks: Sequence[TaskRecord]) -> None:
             else:
                 logger.info("Done checkpointing {} tasks".format(count))
 
-    def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
-        """Load a checkpoint file into a lookup table.
-
-        The data being loaded from the pickle file mostly contains input
-        attributes of the task: func, args, kwargs, env...
-        To simplify the check of whether the exact task has been completed
-        in the checkpoint, we hash these input params and use it as the key
-        for the memoized lookup table.
-
-        Args:
-            - checkpointDirs (list) : List of filepaths to checkpoints
-              Eg. ['runinfo/001', 'runinfo/002']
-
-        Returns:
-            - memoized_lookup_table (dict)
-        """
-        memo_lookup_table = {}
-
-        for checkpoint_dir in checkpointDirs:
-            logger.info("Loading checkpoints from {}".format(checkpoint_dir))
-            checkpoint_file = os.path.join(checkpoint_dir, 'tasks.pkl')
-            try:
-                with open(checkpoint_file, 'rb') as f:
-                    while True:
-                        try:
-                            data = pickle.load(f)
-                            # Copy and hash only the input attributes
-                            memo_fu: Future = Future()
-                            assert data['exception'] is None
-                            memo_fu.set_result(data['result'])
-                            memo_lookup_table[data['hash']] = memo_fu
-
-                        except EOFError:
-                            # Done with the checkpoint file
-                            break
-            except FileNotFoundError:
-                reason = "Checkpoint file was not found: {}".format(
-                    checkpoint_file)
-                logger.error(reason)
-                raise BadCheckpoint(reason)
-            except Exception:
-                reason = "Failed to load checkpoint: {}".format(
-                    checkpoint_file)
-                logger.error(reason)
-                raise BadCheckpoint(reason)
-
-            logger.info("Completed loading checkpoint: {0} with {1} tasks".format(checkpoint_file,
-                                                                                  len(memo_lookup_table.keys())))
-        return memo_lookup_table
-
-    @typeguard.typechecked
-    def load_checkpoints(self, checkpointDirs: Optional[Sequence[str]]) -> Dict[str, Future]:
-        """Load checkpoints from the checkpoint files into a dictionary.
-
-        The results are used to pre-populate the memoizer's lookup_table
-
-        Kwargs:
-             - checkpointDirs (list) : List of run folder to use as checkpoints
-               Eg. ['runinfo/001', 'runinfo/002']
-
-        Returns:
-             - dict containing, hashed -> future mappings
-        """
-        if checkpointDirs:
-            return self._load_checkpoints(checkpointDirs)
-        else:
-            return {}
-
     @staticmethod
     def _log_std_streams(task_record: TaskRecord) -> None:
         tid = task_record['id']
diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py
index 551bd0b9d4..1915b3535a 100644
--- a/parsl/dataflow/memoization.py
+++ b/parsl/dataflow/memoization.py
@@ -2,10 +2,14 @@
 
 import hashlib
 import logging
+import os
 import pickle
 from functools import lru_cache, singledispatch
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
 
+import typeguard
+
+from parsl.dataflow.errors import BadCheckpoint
 from parsl.dataflow.taskrecord import TaskRecord
 
 if TYPE_CHECKING:
@@ -146,7 +150,7 @@ class Memoizer:
 
     """
 
-    def __init__(self, dfk: DataFlowKernel, memoize: bool = True, checkpoint: Dict[str, Future[Any]] = {}):
+    def __init__(self, dfk: DataFlowKernel, *, memoize: bool = True, checkpoint_files: Sequence[str]):
         """Initialize the memoizer.
 
         Args:
@@ -159,6 +163,10 @@ def __init__(self, dfk: DataFlowKernel, memoize: bool = True, checkpoint: Dict[s
         self.dfk = dfk
         self.memoize = memoize
 
+        # TODO: we always load checkpoints even if we then discard them...
+        # this is more obvious here, less obvious in previous Parsl...
+        checkpoint = self.load_checkpoints(checkpoint_files)
+
         if self.memoize:
             logger.info("App caching initialized")
             self.memo_lookup_table = checkpoint
@@ -274,3 +282,71 @@ def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
         else:
             logger.debug(f"Storing app cache entry {task['hashsum']} with result from task {task_id}")
         self.memo_lookup_table[task['hashsum']] = r
+
+    def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
+        """Load a checkpoint file into a lookup table.
+
+        The data being loaded from the pickle file mostly contains input
+        attributes of the task: func, args, kwargs, env...
+        To simplify the check of whether the exact task has been completed
+        in the checkpoint, we hash these input params and use it as the key
+        for the memoized lookup table.
+
+        Args:
+            - checkpointDirs (list) : List of filepaths to checkpoints
+              Eg. ['runinfo/001', 'runinfo/002']
+
+        Returns:
+            - memoized_lookup_table (dict)
+        """
+        memo_lookup_table = {}
+
+        for checkpoint_dir in checkpointDirs:
+            logger.info("Loading checkpoints from {}".format(checkpoint_dir))
+            checkpoint_file = os.path.join(checkpoint_dir, 'tasks.pkl')
+            try:
+                with open(checkpoint_file, 'rb') as f:
+                    while True:
+                        try:
+                            data = pickle.load(f)
+                            # Copy and hash only the input attributes
+                            memo_fu: Future = Future()
+                            assert data['exception'] is None
+                            memo_fu.set_result(data['result'])
+                            memo_lookup_table[data['hash']] = memo_fu
+
+                        except EOFError:
+                            # Done with the checkpoint file
+                            break
+            except FileNotFoundError:
+                reason = "Checkpoint file was not found: {}".format(
+                    checkpoint_file)
+                logger.error(reason)
+                raise BadCheckpoint(reason)
+            except Exception:
+                reason = "Failed to load checkpoint: {}".format(
+                    checkpoint_file)
+                logger.error(reason)
+                raise BadCheckpoint(reason)
+
+            logger.info("Completed loading checkpoint: {0} with {1} tasks".format(checkpoint_file,
+                                                                                  len(memo_lookup_table.keys())))
+        return memo_lookup_table
+
+    @typeguard.typechecked
+    def load_checkpoints(self, checkpointDirs: Optional[Sequence[str]]) -> Dict[str, Future]:
+        """Load checkpoints from the checkpoint files into a dictionary.
+
+        The results are used to pre-populate the memoizer's lookup_table
+
+        Kwargs:
+             - checkpointDirs (list) : List of run folder to use as checkpoints
+               Eg. ['runinfo/001', 'runinfo/002']
+
+        Returns:
+             - dict containing, hashed -> future mappings
+        """
+        if checkpointDirs:
+            return self._load_checkpoints(checkpointDirs)
+        else:
+            return {}

From 40c78fe75bd65f45f92f771368f8568299a77d3e Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Fri, 19 Jul 2024 17:07:23 +0000
Subject: [PATCH 06/13] move checkpoint method from dfk into memoizer

---
 parsl/dataflow/dflow.py       | 76 +++--------------------------------
 parsl/dataflow/memoization.py | 73 +++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 71 deletions(-)

diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index 70f8be29c2..1253b86a80 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -7,7 +7,6 @@
 import logging
 import os
 import pathlib
-import pickle
 import random
 import sys
 import threading
@@ -99,8 +98,6 @@ def __init__(self, config: Config) -> None:
 
         logger.info("Parsl version: {}".format(get_version()))
 
-        self.checkpoint_lock = threading.Lock()
-
         self.usage_tracker = UsageTracker(self)
         self.usage_tracker.send_start_message()
 
@@ -173,7 +170,8 @@ def __init__(self, config: Config) -> None:
             checkpoint_files = []
 
         self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
-        self.checkpointed_tasks = 0
+        self.memoizer.run_dir = self.run_dir
+
         self._checkpoint_timer = None
         self.checkpoint_mode = config.checkpoint_mode
 
@@ -571,7 +569,7 @@ def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
         # Do we need to checkpoint now, or queue for later,
         # or do nothing?
         if self.checkpoint_mode == 'task_exit':
-            self.checkpoint(tasks=[task_record])
+            self.memoizer.checkpoint(tasks=[task_record])
         elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
             with self._modify_checkpointable_tasks_lock:
                 self.checkpointable_tasks.append(task_record)
@@ -1255,7 +1253,7 @@ def cleanup(self) -> None:
 
             # TODO: accesses to self.checkpointable_tasks should happen
             # under a lock?
-            self.checkpoint(self.checkpointable_tasks)
+            self.memoizer.checkpoint(self.checkpointable_tasks)
 
             if self._checkpoint_timer:
                 logger.info("Stopping checkpoint timer")
@@ -1330,73 +1328,9 @@ def cleanup(self) -> None:
 
     def invoke_checkpoint(self) -> None:
         with self._modify_checkpointable_tasks_lock:
-            self.checkpoint(self.checkpointable_tasks)
+            self.memoizer.checkpoint(self.checkpointable_tasks)
             self.checkpointable_tasks = []
 
-    def checkpoint(self, tasks: Sequence[TaskRecord]) -> None:
-        """Checkpoint the dfk incrementally to a checkpoint file.
-
-        When called, every task that has been completed yet not
-        checkpointed is checkpointed to a file.
-
-        Kwargs:
-            - tasks (List of task records) : List of task ids to checkpoint. Default=None
-                                         if set to None, we iterate over all tasks held by the DFK.
-
-        .. note::
-            Checkpointing only works if memoization is enabled
-
-        Returns:
-            Checkpoint dir if checkpoints were written successfully.
-            By default the checkpoints are written to the RUNDIR of the current
-            run under RUNDIR/checkpoints/{tasks.pkl, dfk.pkl}
-        """
-        with self.checkpoint_lock:
-            checkpoint_queue = tasks
-
-            checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
-            checkpoint_dfk = checkpoint_dir + '/dfk.pkl'
-            checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
-
-            if not os.path.exists(checkpoint_dir):
-                os.makedirs(checkpoint_dir, exist_ok=True)
-
-            with open(checkpoint_dfk, 'wb') as f:
-                state = {'rundir': self.run_dir,
-                         'task_count': self.task_count
-                         }
-                pickle.dump(state, f)
-
-            count = 0
-
-            with open(checkpoint_tasks, 'ab') as f:
-                for task_record in checkpoint_queue:
-                    task_id = task_record['id']
-
-                    app_fu = task_record['app_fu']
-
-                    if app_fu.done() and app_fu.exception() is None:
-                        hashsum = task_record['hashsum']
-                        if not hashsum:
-                            continue
-                        t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
-
-                        # We are using pickle here since pickle dumps to a file in 'ab'
-                        # mode behave like a incremental log.
-                        pickle.dump(t, f)
-                        count += 1
-                        logger.debug("Task {} checkpointed".format(task_id))
-
-            self.checkpointed_tasks += count
-
-            if count == 0:
-                if self.checkpointed_tasks == 0:
-                    logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
-                else:
-                    logger.debug("No tasks checkpointed in this pass.")
-            else:
-                logger.info("Done checkpointing {} tasks".format(count))
-
     @staticmethod
     def _log_std_streams(task_record: TaskRecord) -> None:
         tid = task_record['id']
diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py
index 1915b3535a..ff9d855306 100644
--- a/parsl/dataflow/memoization.py
+++ b/parsl/dataflow/memoization.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import pickle
+import threading
 from functools import lru_cache, singledispatch
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
 
@@ -150,6 +151,8 @@ class Memoizer:
 
     """
 
+    run_dir: str
+
     def __init__(self, dfk: DataFlowKernel, *, memoize: bool = True, checkpoint_files: Sequence[str]):
         """Initialize the memoizer.
 
@@ -163,6 +166,10 @@ def __init__(self, dfk: DataFlowKernel, *, memoize: bool = True, checkpoint_file
         self.dfk = dfk
         self.memoize = memoize
 
+        self.checkpointed_tasks = 0
+
+        self.checkpoint_lock = threading.Lock()
+
         # TODO: we always load checkpoints even if we then discard them...
         # this is more obvious here, less obvious in previous Parsl...
         checkpoint = self.load_checkpoints(checkpoint_files)
@@ -350,3 +357,69 @@ def load_checkpoints(self, checkpointDirs: Optional[Sequence[str]]) -> Dict[str,
             return self._load_checkpoints(checkpointDirs)
         else:
             return {}
+
+    def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
+        """Checkpoint the dfk incrementally to a checkpoint file.
+
+        When called, every task that has been completed yet not
+        checkpointed is checkpointed to a file.
+
+        Kwargs:
+            - tasks (List of task records) : List of task ids to checkpoint. Default=None
+                                         if set to None, we iterate over all tasks held by the DFK.
+
+        .. note::
+            Checkpointing only works if memoization is enabled
+
+        Returns:
+            Checkpoint dir if checkpoints were written successfully.
+            By default the checkpoints are written to the RUNDIR of the current
+            run under RUNDIR/checkpoints/{tasks.pkl, dfk.pkl}
+        """
+        with self.checkpoint_lock:
+            checkpoint_queue = tasks
+
+            checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
+            checkpoint_dfk = checkpoint_dir + '/dfk.pkl'
+            checkpoint_tasks = checkpoint_dir + '/tasks.pkl'
+
+            if not os.path.exists(checkpoint_dir):
+                os.makedirs(checkpoint_dir, exist_ok=True)
+
+            with open(checkpoint_dfk, 'wb') as f:
+                state = {'rundir': self.run_dir,
+                         # TODO: this isn't relevant to checkpointing? 'task_count': self.task_count
+                         }
+                pickle.dump(state, f)
+
+            count = 0
+
+            with open(checkpoint_tasks, 'ab') as f:
+                for task_record in checkpoint_queue:
+                    task_id = task_record['id']
+
+                    app_fu = task_record['app_fu']
+
+                    if app_fu.done() and app_fu.exception() is None:
+                        hashsum = task_record['hashsum']
+                        if not hashsum:
+                            continue
+                        t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+
+                        # We are using pickle here since pickle dumps to a file in 'ab'
+                        # mode behave like a incremental log.
+                        pickle.dump(t, f)
+                        count += 1
+                        logger.debug("Task {} checkpointed".format(task_id))
+
+            self.checkpointed_tasks += count
+
+            if count == 0:
+                if self.checkpointed_tasks == 0:
+                    logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
+                else:
+                    logger.debug("No tasks checkpointed in this pass.")
+            else:
+                logger.info("Done checkpointing {} tasks".format(count))
+
+            return checkpoint_dir

From 8ba7557b4e2846257c0fa4d980d0f6f9d53be730 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Fri, 19 Jul 2024 17:22:05 +0000
Subject: [PATCH 07/13] make memoizer into an interface class and impls

---
 parsl/dataflow/dflow.py       |  4 ++--
 parsl/dataflow/memoization.py | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index 1253b86a80..3ef0ef589f 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -31,7 +31,7 @@
 from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
 from parsl.dataflow.errors import DependencyError, JoinError
 from parsl.dataflow.futures import AppFuture
-from parsl.dataflow.memoization import Memoizer
+from parsl.dataflow.memoization import BasicMemoizer, Memoizer
 from parsl.dataflow.rundirs import make_rundir
 from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES, States
 from parsl.dataflow.taskrecord import TaskRecord
@@ -169,7 +169,7 @@ def __init__(self, config: Config) -> None:
         else:
             checkpoint_files = []
 
-        self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
+        self.memoizer: Memoizer = BasicMemoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
         self.memoizer.run_dir = self.run_dir
 
         self._checkpoint_timer = None
diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py
index ff9d855306..5324bf8164 100644
--- a/parsl/dataflow/memoization.py
+++ b/parsl/dataflow/memoization.py
@@ -121,6 +121,17 @@ def id_for_memo_function(f: types.FunctionType, output_ref: bool = False) -> byt
 
 
 class Memoizer:
+    def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
+        raise NotImplementedError
+
+    def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
+        raise NotImplementedError
+
+    def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
+        raise NotImplementedError
+
+
+class BasicMemoizer(Memoizer):
     """Memoizer is responsible for ensuring that identical work is not repeated.
 
     When a task is repeated, i.e., the same function is called with the same exact arguments, the

From 09eb41d4f1fdad055f16dab31b96c93e60136b0f Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Fri, 19 Jul 2024 17:29:12 +0000
Subject: [PATCH 08/13] configurable memoizer instance

---
 parsl/config.py                               |  3 ++
 parsl/dataflow/dflow.py                       | 11 +++-
 parsl/dataflow/memoization.py                 |  9 +++-
 .../test_python_apps/test_memoize_plugin.py   | 53 +++++++++++++++++++
 4 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 parsl/tests/test_python_apps/test_memoize_plugin.py

diff --git a/parsl/config.py b/parsl/config.py
index c3725eccf8..2121baedb5 100644
--- a/parsl/config.py
+++ b/parsl/config.py
@@ -5,6 +5,7 @@
 from typing_extensions import Literal
 
 from parsl.dataflow.dependency_resolvers import DependencyResolver
+from parsl.dataflow.memoization import Memoizer
 from parsl.dataflow.taskrecord import TaskRecord
 from parsl.errors import ConfigurationError
 from parsl.executors.base import ParslExecutor
@@ -98,6 +99,7 @@ class Config(RepresentationMixin, UsageInformation):
     def __init__(self,
                  executors: Optional[Iterable[ParslExecutor]] = None,
                  app_cache: bool = True,
+                 memoizer: Optional[Memoizer] = None,
                  checkpoint_files: Optional[Sequence[str]] = None,
                  checkpoint_mode: Union[None,
                                         Literal['task_exit'],
@@ -127,6 +129,7 @@ def __init__(self,
         self._executors: Sequence[ParslExecutor] = executors
         self._validate_executors()
 
+        self.memoizer = memoizer
         self.app_cache = app_cache
         self.checkpoint_files = checkpoint_files
         self.checkpoint_mode = checkpoint_mode
diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index 3ef0ef589f..1b1d61ff9d 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -169,9 +169,16 @@ def __init__(self, config: Config) -> None:
         else:
             checkpoint_files = []
 
-        self.memoizer: Memoizer = BasicMemoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
-        self.memoizer.run_dir = self.run_dir
+        # self.memoizer: Memoizer = BasicMemoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
+        # the memoize flag might turn into the user choosing different instances
+        # of the Memoizer interface
+        self.memoizer: Memoizer
+        if config.memoizer is not None:
+            self.memoizer = config.memoizer
+        else:
+            self.memoizer = BasicMemoizer()
 
+        self.memoizer.start(dfk=self, memoize=config.app_cache, checkpoint_files=checkpoint_files, run_dir=self.run_dir)
         self._checkpoint_timer = None
         self.checkpoint_mode = config.checkpoint_mode
 
diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py
index 5324bf8164..865705440a 100644
--- a/parsl/dataflow/memoization.py
+++ b/parsl/dataflow/memoization.py
@@ -121,6 +121,9 @@ def id_for_memo_function(f: types.FunctionType, output_ref: bool = False) -> byt
 
 
 class Memoizer:
+    def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files: Sequence[str], run_dir: str) -> None:
+        raise NotImplementedError
+
     def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
         raise NotImplementedError
 
@@ -164,7 +167,10 @@ class BasicMemoizer(Memoizer):
 
     run_dir: str
 
-    def __init__(self, dfk: DataFlowKernel, *, memoize: bool = True, checkpoint_files: Sequence[str]):
+    def __init__(self) -> None:
+        pass
+
+    def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files: Sequence[str], run_dir: str) -> None:
         """Initialize the memoizer.
 
         Args:
@@ -176,6 +182,7 @@ def __init__(self, dfk: DataFlowKernel, *, memoize: bool = True, checkpoint_file
         """
         self.dfk = dfk
         self.memoize = memoize
+        self.run_dir = run_dir
 
         self.checkpointed_tasks = 0
 
diff --git a/parsl/tests/test_python_apps/test_memoize_plugin.py b/parsl/tests/test_python_apps/test_memoize_plugin.py
new file mode 100644
index 0000000000..724facf165
--- /dev/null
+++ b/parsl/tests/test_python_apps/test_memoize_plugin.py
@@ -0,0 +1,53 @@
+import argparse
+
+import pytest
+
+import parsl
+from parsl.app.app import python_app
+from parsl.config import Config
+from parsl.dataflow.memoization import BasicMemoizer
+from parsl.dataflow.taskrecord import TaskRecord
+
+
+class DontReuseSevenMemoizer(BasicMemoizer):
+    def check_memo(self, task_record: TaskRecord):
+        if task_record['args'][0] == 7:
+            return None  # we didn't find a suitable memo record...
+        else:
+            return super().check_memo(task_record)
+
+
+def local_config():
+    return Config(memoizer=DontReuseSevenMemoizer())
+
+
+@python_app(cache=True)
+def random_uuid(x, cache=True):
+    import uuid
+    return str(uuid.uuid4())
+
+
+@pytest.mark.local
+def test_python_memoization(n=10):
+    """Testing python memoization disable
+    """
+
+    # TODO: this .result() needs to be here, not in the loop
+    # because otherwise we race to complete... and then
+    # we might sometimes get a memoization before the loop
+    # and sometimes not...
+    x = random_uuid(0).result()
+
+    for i in range(0, n):
+        foo = random_uuid(0)
+        print(i)
+        print(foo.result())
+        assert foo.result() == x, "Memoized results were incorrectly not used"
+
+    y = random_uuid(7).result()
+
+    for i in range(0, n):
+        foo = random_uuid(7)
+        print(i)
+        print(foo.result())
+        assert foo.result() != y, "Memoized results were incorrectly used"

From 1745e7ef2b0dc99859a63ed29a6963b570dedda8 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Tue, 30 Jul 2024 19:36:05 +0000
Subject: [PATCH 09/13] checkpoint exceptions

---
 parsl/dataflow/memoization.py                 | 24 +++++--
 .../test_python_checkpoint_exceptions.py      | 66 +++++++++++++++++++
 2 files changed, 85 insertions(+), 5 deletions(-)
 create mode 100644 parsl/tests/test_checkpointing/test_python_checkpoint_exceptions.py

diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py
index 865705440a..4c511cdd61 100644
--- a/parsl/dataflow/memoization.py
+++ b/parsl/dataflow/memoization.py
@@ -11,6 +11,7 @@
 import typeguard
 
 from parsl.dataflow.errors import BadCheckpoint
+from parsl.dataflow.futures import AppFuture
 from parsl.dataflow.taskrecord import TaskRecord
 
 if TYPE_CHECKING:
@@ -336,8 +337,12 @@ def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[A
                             data = pickle.load(f)
                             # Copy and hash only the input attributes
                             memo_fu: Future = Future()
-                            assert data['exception'] is None
-                            memo_fu.set_result(data['result'])
+
+                            if data['exception'] is None:
+                                memo_fu.set_result(data['result'])
+                            else:
+                                assert data['result'] is None
+                                memo_fu.set_exception(data['exception'])
                             memo_lookup_table[data['hash']] = memo_fu
 
                         except EOFError:
@@ -418,17 +423,22 @@ def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
 
                     app_fu = task_record['app_fu']
 
-                    if app_fu.done() and app_fu.exception() is None:
+                    if app_fu.done() and self.filter_for_checkpoint(app_fu):
+
                         hashsum = task_record['hashsum']
                         if not hashsum:
                             continue
-                        t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+
+                        if app_fu.exception() is None:
+                            t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+                        else:
+                            t = {'hash': hashsum, 'exception': app_fu.exception(), 'result': None}
 
                         # We are using pickle here since pickle dumps to a file in 'ab'
                         # mode behave like a incremental log.
                         pickle.dump(t, f)
                         count += 1
-                        logger.debug("Task {} checkpointed".format(task_id))
+                        logger.debug("Task {} checkpointed as result".format(task_id))
 
             self.checkpointed_tasks += count
 
@@ -441,3 +451,7 @@ def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
                 logger.info("Done checkpointing {} tasks".format(count))
 
             return checkpoint_dir
+
+    def filter_for_checkpoint(self, app_fu: AppFuture) -> bool:
+        """Overridable method to decide if an entry should be checkpointed"""
+        return app_fu.exception() is None
diff --git a/parsl/tests/test_checkpointing/test_python_checkpoint_exceptions.py b/parsl/tests/test_checkpointing/test_python_checkpoint_exceptions.py
new file mode 100644
index 0000000000..1eca421562
--- /dev/null
+++ b/parsl/tests/test_checkpointing/test_python_checkpoint_exceptions.py
@@ -0,0 +1,66 @@
+import contextlib
+import os
+
+import pytest
+
+import parsl
+from parsl import python_app
+from parsl.config import Config
+from parsl.dataflow.memoization import BasicMemoizer
+from parsl.executors.threads import ThreadPoolExecutor
+
+
+class CheckpointExceptionsMemoizer(BasicMemoizer):
+    def filter_for_checkpoint(self, app_fu):
+        # checkpoint everything, rather than selecting only futures with
+        # results, not exceptions.
+
+        # task record is available from app_fu.task_record
+        assert app_fu.task_record is not None
+
+        return True
+
+
+def fresh_config():
+    return Config(
+        memoizer=CheckpointExceptionsMemoizer(),
+        executors=[
+            ThreadPoolExecutor(
+                label='local_threads_checkpoint',
+            )
+        ]
+    )
+
+
+@contextlib.contextmanager
+def parsl_configured(run_dir, **kw):
+    c = fresh_config()
+    c.run_dir = run_dir
+    for config_attr, config_val in kw.items():
+        setattr(c, config_attr, config_val)
+    dfk = parsl.load(c)
+    for ex in dfk.executors.values():
+        ex.working_dir = run_dir
+    yield dfk
+
+    parsl.dfk().cleanup()
+
+
+@python_app(cache=True)
+def uuid_app():
+    import uuid
+    raise RuntimeError(str(uuid.uuid4()))
+
+
+@pytest.mark.local
+def test_loading_checkpoint(tmpd_cwd):
+    """Load memoization table from previous checkpoint
+    """
+    with parsl_configured(tmpd_cwd, checkpoint_mode="task_exit"):
+        checkpoint_files = [os.path.join(parsl.dfk().run_dir, "checkpoint")]
+        result = uuid_app().exception()
+
+    with parsl_configured(tmpd_cwd, checkpoint_files=checkpoint_files):
+        relaunched = uuid_app().exception()
+
+    assert result.args == relaunched.args, "Expected following call to uuid_app to return cached uuid in exception"

From 333c7eb4ab9a2a0ca0c1b6a6c3f66d9af6e14e41 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Thu, 22 Aug 2024 09:33:57 +0000
Subject: [PATCH 10/13] add a todo on checkpoint policy position

---
 parsl/dataflow/dflow.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index 1b1d61ff9d..bd62b1b9ea 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -162,6 +162,8 @@ def __init__(self, config: Config) -> None:
             self.monitoring.send(MessageType.WORKFLOW_INFO,
                                  workflow_info)
 
+        # TODO: this configuration should become part of the particular memoizer code
+        # - this is a checkpoint-implementation-specific parameter
         if config.checkpoint_files is not None:
             checkpoint_files = config.checkpoint_files
         elif config.checkpoint_files is None and config.checkpoint_mode is not None:
@@ -198,6 +200,10 @@ def __init__(self, config: Config) -> None:
         self.add_executors(config.executors)
         self.add_executors([parsl_internal_executor])
 
+        # TODO: these checkpoint modes should move into the memoizer implementation
+        # they're (probably?) checkpointer specific: for example the sqlite3-pure-memoizer
+        # doesn't have a notion of building up an in-memory checkpoint table that needs to be
+        # flushed on a separate policy
         if self.checkpoint_mode == "periodic":
             if config.checkpoint_period is None:
                 raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")

From e78e12db3ddf45243fb0dabfd121c2f8abf2157f Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Thu, 22 Aug 2024 13:19:46 +0000
Subject: [PATCH 11/13] make hash does not need to be part of basic memoizer
 and is more reusable when it isn't

this isn't the only way to make a hash though. and
hashing isn't the only way to compare checkpoint
entries for equality.
---
 parsl/dataflow/memoization.py | 73 ++++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py
index 4c511cdd61..a50b411a9d 100644
--- a/parsl/dataflow/memoization.py
+++ b/parsl/dataflow/memoization.py
@@ -121,6 +121,42 @@ def id_for_memo_function(f: types.FunctionType, output_ref: bool = False) -> byt
     return pickle.dumps(["types.FunctionType", f.__name__, f.__module__])
 
 
+def make_hash(task: TaskRecord) -> str:
+    """Create a hash of the task inputs.
+
+    Args:
+        - task (dict) : Task dictionary from dfk.tasks
+
+    Returns:
+        - hash (str) : A unique hash string
+    """
+
+    t: List[bytes] = []
+
+    # if kwargs contains an outputs parameter, that parameter is removed
+    # and normalised differently - with output_ref set to True.
+    # kwargs listed in ignore_for_cache will also be removed
+
+    filtered_kw = task['kwargs'].copy()
+
+    ignore_list = task['ignore_for_cache']
+
+    logger.debug("Ignoring these kwargs for checkpointing: %s", ignore_list)
+    for k in ignore_list:
+        logger.debug("Ignoring kwarg %s", k)
+        del filtered_kw[k]
+
+    if 'outputs' in task['kwargs']:
+        outputs = task['kwargs']['outputs']
+        del filtered_kw['outputs']
+        t.append(id_for_memo(outputs, output_ref=True))
+
+    t.extend(map(id_for_memo, (filtered_kw, task['func'], task['args'])))
+
+    x = b''.join(t)
+    return hashlib.md5(x).hexdigest()
+
+
 class Memoizer:
     def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files: Sequence[str], run_dir: str) -> None:
         raise NotImplementedError
@@ -200,41 +236,6 @@ def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files:
             logger.info("App caching disabled for all apps")
             self.memo_lookup_table = {}
 
-    def make_hash(self, task: TaskRecord) -> str:
-        """Create a hash of the task inputs.
-
-        Args:
-            - task (dict) : Task dictionary from dfk.tasks
-
-        Returns:
-            - hash (str) : A unique hash string
-        """
-
-        t: List[bytes] = []
-
-        # if kwargs contains an outputs parameter, that parameter is removed
-        # and normalised differently - with output_ref set to True.
-        # kwargs listed in ignore_for_cache will also be removed
-
-        filtered_kw = task['kwargs'].copy()
-
-        ignore_list = task['ignore_for_cache']
-
-        logger.debug("Ignoring these kwargs for checkpointing: %s", ignore_list)
-        for k in ignore_list:
-            logger.debug("Ignoring kwarg %s", k)
-            del filtered_kw[k]
-
-        if 'outputs' in task['kwargs']:
-            outputs = task['kwargs']['outputs']
-            del filtered_kw['outputs']
-            t.append(id_for_memo(outputs, output_ref=True))
-
-        t.extend(map(id_for_memo, (filtered_kw, task['func'], task['args'])))
-
-        x = b''.join(t)
-        return hashlib.md5(x).hexdigest()
-
     def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
         """Create a hash of the task and its inputs and check the lookup table for this hash.
 
@@ -256,7 +257,7 @@ def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
             logger.debug("Task {} will not be memoized".format(task_id))
             return None
 
-        hashsum = self.make_hash(task)
+        hashsum = make_hash(task)
         logger.debug("Task {} has memoization hash {}".format(task_id, hashsum))
         result = None
         if hashsum in self.memo_lookup_table:

From 9ff13d7a1fdcc5f68c94d411f8e491e2d8a576a3 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Thu, 22 Aug 2024 13:38:44 +0000
Subject: [PATCH 12/13] close method for api

---
 parsl/dataflow/dflow.py       | 5 +++++
 parsl/dataflow/memoization.py | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/parsl/dataflow/dflow.py b/parsl/dataflow/dflow.py
index bd62b1b9ea..c6bd7fdedf 100644
--- a/parsl/dataflow/dflow.py
+++ b/parsl/dataflow/dflow.py
@@ -1260,6 +1260,7 @@ def cleanup(self) -> None:
 
         self.log_task_states()
 
+        # TODO: do this in the basic memoizer
         # Checkpointing takes priority over the rest of the tasks
         # checkpoint if any valid checkpoint method is specified
         if self.checkpoint_mode is not None:
@@ -1272,6 +1273,10 @@ def cleanup(self) -> None:
                 logger.info("Stopping checkpoint timer")
                 self._checkpoint_timer.close()
 
+        logger.info("Closing memoizer")
+        self.memoizer.close()
+        logger.info("Closed memoizer")
+
         # Send final stats
         logger.info("Sending end message for usage tracking")
         self.usage_tracker.send_end_message()
diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py
index a50b411a9d..0a5f541b9c 100644
--- a/parsl/dataflow/memoization.py
+++ b/parsl/dataflow/memoization.py
@@ -161,6 +161,9 @@ class Memoizer:
     def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files: Sequence[str], run_dir: str) -> None:
         raise NotImplementedError
 
+    def close(self) -> None:
+        raise NotImplementedError
+
     def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
         raise NotImplementedError
 
@@ -236,6 +239,9 @@ def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files:
             logger.info("App caching disabled for all apps")
             self.memo_lookup_table = {}
 
+    def close(self) -> None:
+        pass   # nothing to close but more should move here
+
     def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
         """Create a hash of the task and its inputs and check the lookup table for this hash.
 

From 4726c81d242a90af37a1abd363c9b1032e4151a2 Mon Sep 17 00:00:00 2001
From: Ben Clifford <benc@hawaga.org.uk>
Date: Thu, 22 Aug 2024 09:20:30 +0000
Subject: [PATCH 13/13] out-of-memory checkpointing

goal: results should not (never? in a weak small cache?) be stored in an in-memory memo table, so that memo table should not be present in this implementation. Instead, all memo questions go to the sqlite3 database.

This drives some blurring between in-memory caching and disk-based checkpointing: the previous disk-based checkpointing model relied on repopulating the in-memory memo table cache...

I hit some thread problems when using one sqlite3 connection across threads, and the docs are unclear about what I can/cannot do, so I made this open the sqlite3 database on every access. That probably carries quite a performance hit, but it's probably enough for basically validating the idea.
---
 parsl/dataflow/memoization.py                 |  20 ++-
 parsl/dataflow/memosql.py                     | 118 ++++++++++++++++++
 parsl/tests/configs/htex_local_alternate.py   |   4 +-
 .../test_python_checkpoint_2_sqlite.py        |  44 +++++++
 4 files changed, 174 insertions(+), 12 deletions(-)
 create mode 100644 parsl/dataflow/memosql.py
 create mode 100644 parsl/tests/test_checkpointing/test_python_checkpoint_2_sqlite.py

diff --git a/parsl/dataflow/memoization.py b/parsl/dataflow/memoization.py
index 0a5f541b9c..548950bf85 100644
--- a/parsl/dataflow/memoization.py
+++ b/parsl/dataflow/memoization.py
@@ -6,7 +6,7 @@
 import pickle
 import threading
 from functools import lru_cache, singledispatch
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence
 
 import typeguard
 
@@ -164,13 +164,13 @@ def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files:
     def close(self) -> None:
         raise NotImplementedError
 
-    def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
+    def update_memo(self, task: TaskRecord, r: Future) -> None:
         raise NotImplementedError
 
-    def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
+    def checkpoint(self, tasks: Sequence[TaskRecord]) -> None:
         raise NotImplementedError
 
-    def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
+    def check_memo(self, task: TaskRecord) -> Optional[Future]:
         raise NotImplementedError
 
 
@@ -242,7 +242,7 @@ def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files:
     def close(self) -> None:
         pass   # nothing to close but more should move here
 
-    def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
+    def check_memo(self, task: TaskRecord) -> Optional[Future]:
         """Create a hash of the task and its inputs and check the lookup table for this hash.
 
         If present, the results are returned.
@@ -277,7 +277,7 @@ def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
         assert isinstance(result, Future) or result is None
         return result
 
-    def hash_lookup(self, hashsum: str) -> Future[Any]:
+    def hash_lookup(self, hashsum: str) -> Future:
         """Lookup a hash in the memoization table.
 
         Args:
@@ -291,7 +291,7 @@ def hash_lookup(self, hashsum: str) -> Future[Any]:
         """
         return self.memo_lookup_table[hashsum]
 
-    def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
+    def update_memo(self, task: TaskRecord, r: Future) -> None:
         """Updates the memoization lookup table with the result from a task.
 
         Args:
@@ -316,7 +316,7 @@ def update_memo(self, task: TaskRecord, r: Future[Any]) -> None:
             logger.debug(f"Storing app cache entry {task['hashsum']} with result from task {task_id}")
         self.memo_lookup_table[task['hashsum']] = r
 
-    def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future[Any]]:
+    def _load_checkpoints(self, checkpointDirs: Sequence[str]) -> Dict[str, Future]:
         """Load a checkpoint file into a lookup table.
 
         The data being loaded from the pickle file mostly contains input
@@ -388,7 +388,7 @@ def load_checkpoints(self, checkpointDirs: Optional[Sequence[str]]) -> Dict[str,
         else:
             return {}
 
-    def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
+    def checkpoint(self, tasks: Sequence[TaskRecord]) -> None:
         """Checkpoint the dfk incrementally to a checkpoint file.
 
         When called, every task that has been completed yet not
@@ -457,8 +457,6 @@ def checkpoint(self, tasks: Sequence[TaskRecord]) -> str:
             else:
                 logger.info("Done checkpointing {} tasks".format(count))
 
-            return checkpoint_dir
-
     def filter_for_checkpoint(self, app_fu: AppFuture) -> bool:
         """Overridable method to decide if an entry should be checkpointed"""
         return app_fu.exception() is None
diff --git a/parsl/dataflow/memosql.py b/parsl/dataflow/memosql.py
new file mode 100644
index 0000000000..42bddd505a
--- /dev/null
+++ b/parsl/dataflow/memosql.py
@@ -0,0 +1,118 @@
+import logging
+import pickle
+import sqlite3
+from concurrent.futures import Future
+from pathlib import Path
+from typing import Optional, Sequence
+
+from parsl.dataflow.dflow import DataFlowKernel
+from parsl.dataflow.memoization import Memoizer, make_hash
+from parsl.dataflow.taskrecord import TaskRecord
+
+logger = logging.getLogger(__name__)
+
+
+class SQLiteMemoizer(Memoizer):
+    """Memoize out of memory into an sqlite3 database.
+
+    TODO: probably going to need some kind of shutdown now, to close
+    the sqlite3 connection.
+    which might also be useful for driving final checkpoints in the
+    original impl?
+    """
+
+    def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files: Sequence[str], run_dir: str) -> None:
+        """TODO: run_dir is the per-workflow run dir, but we need a broader checkpoint context... one level up
+        by default... get_all_checkpoints uses "runinfo/" as a relative path for that by default so replicating
+        that choice would do here. likewise I think for monitoring."""
+
+        self.db_path = Path(dfk.config.run_dir) / "checkpoint.sqlite3"
+        logger.debug("starting with db_path %r", self.db_path)
+
+        # TODO: api wart... turning memoization on or off should not be part of the plugin API
+        self.memoize = memoize
+
+        connection = sqlite3.connect(self.db_path)
+        cursor = connection.cursor()
+
+        cursor.execute("CREATE TABLE IF NOT EXISTS checkpoints(key, result)")
+        # probably want some index on key because that's what we're doing all the access via.
+
+        connection.commit()
+        connection.close()
+        logger.debug("checkpoint table created")
+
+    def close(self):
+        pass
+
+    def checkpoint(self, tasks: Sequence[TaskRecord]) -> None:
+        """All the behaviour for this memoizer is in check_memo and update_memo.
+        """
+        logger.debug("Explicit checkpoint call is a no-op with this memoizer")
+
+    def check_memo(self, task: TaskRecord) -> Optional[Future]:
+        """TODO: document this: check_memo is required to set the task hashsum,
+        if that's how we're going to key checkpoints in update_memo. (that's not
+        a requirement though: other equalities are available."""
+        task_id = task['id']
+
+        if not self.memoize or not task['memoize']:
+            task['hashsum'] = None
+            logger.debug("Task %s will not be memoized", task_id)
+            return None
+
+        hashsum = make_hash(task)
+        logger.debug("Task {} has memoization hash {}".format(task_id, hashsum))
+        task['hashsum'] = hashsum
+
+        connection = sqlite3.connect(self.db_path)
+        cursor = connection.cursor()
+        cursor.execute("SELECT result FROM checkpoints WHERE key = ?", (hashsum, ))
+        r = cursor.fetchone()
+
+        if r is None:
+            connection.close()
+            return None
+        else:
+            data = pickle.loads(r[0])
+            connection.close()
+
+            memo_fu: Future = Future()
+
+            if data['exception'] is None:
+                memo_fu.set_result(data['result'])
+            else:
+                assert data['result'] is None
+                memo_fu.set_exception(data['exception'])
+
+            return memo_fu
+
+    def update_memo(self, task: TaskRecord, r: Future) -> None:
+        logger.debug("updating memo")
+
+        if not self.memoize or not task['memoize'] or 'hashsum' not in task:
+            logger.debug("preconditions for memo not satisfied")
+            return
+
+        if not isinstance(task['hashsum'], str):
+            logger.error(f"Attempting to update app cache entry but hashsum is not a string key: {task['hashsum']}")
+            return
+
+        app_fu = task['app_fu']
+        hashsum = task['hashsum']
+
+        # this comes from the original concatenation-based checkpoint code:
+        if app_fu.exception() is None:
+            t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}
+        else:
+            t = {'hash': hashsum, 'exception': app_fu.exception(), 'result': None}
+
+        value = pickle.dumps(t)
+
+        connection = sqlite3.connect(self.db_path)
+        cursor = connection.cursor()
+
+        cursor.execute("INSERT INTO checkpoints VALUES(?, ?)", (hashsum, value))
+
+        connection.commit()
+        connection.close()
diff --git a/parsl/tests/configs/htex_local_alternate.py b/parsl/tests/configs/htex_local_alternate.py
index 52124211bc..91ad5589d1 100644
--- a/parsl/tests/configs/htex_local_alternate.py
+++ b/parsl/tests/configs/htex_local_alternate.py
@@ -23,6 +23,7 @@
 from parsl.data_provider.ftp import FTPInTaskStaging
 from parsl.data_provider.http import HTTPInTaskStaging
 from parsl.data_provider.zip import ZipFileStaging
+from parsl.dataflow.memosql import SQLiteMemoizer
 from parsl.executors import HighThroughputExecutor
 from parsl.launchers import SingleNodeLauncher
 
@@ -66,7 +67,8 @@ def fresh_config():
                         monitoring_debug=False,
                         resource_monitoring_interval=1,
         ),
-        usage_tracking=True
+        usage_tracking=True,
+        memoizer=SQLiteMemoizer()
     )
 
 
diff --git a/parsl/tests/test_checkpointing/test_python_checkpoint_2_sqlite.py b/parsl/tests/test_checkpointing/test_python_checkpoint_2_sqlite.py
new file mode 100644
index 0000000000..756dcad113
--- /dev/null
+++ b/parsl/tests/test_checkpointing/test_python_checkpoint_2_sqlite.py
@@ -0,0 +1,44 @@
+import contextlib
+import os
+
+import pytest
+
+import parsl
+from parsl import python_app
+from parsl.dataflow.memosql import SQLiteMemoizer
+from parsl.tests.configs.local_threads_checkpoint import fresh_config
+
+
+@contextlib.contextmanager
+def parsl_configured(run_dir, **kw):
+    c = fresh_config()
+    c.memoizer = SQLiteMemoizer()
+    c.run_dir = run_dir
+    for config_attr, config_val in kw.items():
+        setattr(c, config_attr, config_val)
+    dfk = parsl.load(c)
+    for ex in dfk.executors.values():
+        ex.working_dir = run_dir
+    yield dfk
+
+    parsl.dfk().cleanup()
+
+
+@python_app(cache=True)
+def uuid_app():
+    import uuid
+    return uuid.uuid4()
+
+
+@pytest.mark.local
+def test_loading_checkpoint(tmpd_cwd):
+    """Load memoization table from previous checkpoint
+    """
+    with parsl_configured(tmpd_cwd, checkpoint_mode="task_exit"):
+        checkpoint_files = [os.path.join(parsl.dfk().run_dir, "checkpoint")]
+        result = uuid_app().result()
+
+    with parsl_configured(tmpd_cwd, checkpoint_files=checkpoint_files):
+        relaunched = uuid_app().result()
+
+    assert result == relaunched, "Expected following call to uuid_app to return cached uuid"