Skip to content

Commit

Permalink
Merge pull request #122 from ucbepic/shreya/smg
Browse files Browse the repository at this point in the history
fix: cache partial pipeline runs
  • Loading branch information
shreyashankar authored Oct 22, 2024
2 parents 1e6bba5 + b921d34 commit ebeb51d
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 17 deletions.
21 changes: 21 additions & 0 deletions docetl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,27 @@ def execute_step(
# Checkpoint this operation's output so a later run can resume from it.
if self.intermediate_dir:
self._save_checkpoint(step["name"], operation_name, input_data)

# Load existing step op hash, if exists, merge self.step_op_hashes[step["name"]][operation_name] into it
# Save the step op hash
# NOTE(review): scrape flattened the indentation here — this read/merge/write
# presumably sits inside the `if self.intermediate_dir:` branch above, since it
# uses self.intermediate_dir; confirm nesting against the repository.
intermediate_config_path = os.path.join(
self.intermediate_dir, ".docetl_intermediate_config.json"
)
# Read the config written by earlier (possibly partial) runs so their
# recorded hashes survive; start fresh only if no file exists yet.
if os.path.exists(intermediate_config_path):
with open(intermediate_config_path, "r") as f:
existing_config = json.load(f)
else:
existing_config = {}

# Merge this run's hash for (step, operation) into the on-disk record.
if step["name"] not in existing_config:
existing_config[step["name"]] = {}
existing_config[step["name"]][operation_name] = self.step_op_hashes[
step["name"]
][operation_name]

# Resave the merged config so future runs can detect unchanged operations.
with open(intermediate_config_path, "w") as f:
json.dump(existing_config, f, indent=2)

return input_data, total_cost

def _load_from_checkpoint_if_exists(
Expand Down
17 changes: 1 addition & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ python = "^3.10"
litellm = "^1.42.1"
tqdm = "^4.66.4"
rich = "^13.7.1"
tenacity = "^9.0.0"
frozendict = "^2.4.4"
diskcache = "^5.6.3"
typer = "^0.12.5"
Expand Down
73 changes: 73 additions & 0 deletions tests/test_runner_caching.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import shutil
import time
import pytest
import json
Expand Down Expand Up @@ -118,3 +119,75 @@ def test_pipeline_rerun_on_operation_change(

# Check that the runtime is faster when not modifying
assert unmodified_runtime < modified_runtime


# Test with an incorrect later operation but correct earlier operation
def test_partial_caching(temp_input_file, temp_output_file, temp_intermediate_dir):
    """A failing later operation must not invalidate earlier checkpoints.

    Runs a two-operation pipeline whose second operation is deliberately
    broken, so the run raises after the first operation has been
    checkpointed.  Then reruns a pipeline containing only the valid first
    operation and asserts the rerun costs nothing, proving the partial
    result was cached and reused.
    """
    # Pipeline with a valid first operation and a deliberately broken second one.
    initial_pipeline = Pipeline(
        name="test_pipeline",
        datasets={"test_input": Dataset(type="file", path=temp_input_file)},
        operations=[
            MapOp(
                name="first_operation",
                type="map",
                prompt="Analyze the sentiment of the following text: '{{ input.text }}'",
                output={"schema": {"sentiment": "string"}},
                model="gpt-4o-mini",
            ),
            MapOp(
                name="second_operation_bad",
                type="map",
                # Undefined template variable and an invalid schema value:
                # this operation is expected to fail at runtime.
                prompt="Summarize the following text: '{{ forororororo }}'",
                output={"schema": {"summary": "1000"}},
                model="gpt-4o-mini",
            ),
        ],
        steps=[
            PipelineStep(
                name="first_step",
                input="test_input",
                operations=["first_operation", "second_operation_bad"],
            ),
        ],
        output=PipelineOutput(
            type="file", path=temp_output_file, intermediate_dir=temp_intermediate_dir
        ),
        default_model="gpt-4o-mini",
    )

    # The broken second operation makes the initial run raise; the first
    # operation should still have been checkpointed before the failure.
    # (No assignment here — run() raises, so a result would never be bound.)
    with pytest.raises(Exception):
        initial_pipeline.run()

    # Same pipeline minus the broken operation: the remaining operation's
    # output should come entirely from the checkpoint left by the failed run.
    new_pipeline_with_only_one_op = Pipeline(
        name="test_pipeline",
        datasets={"test_input": Dataset(type="file", path=temp_input_file)},
        operations=[
            MapOp(
                name="first_operation",
                type="map",
                prompt="Analyze the sentiment of the following text: '{{ input.text }}'",
                output={"schema": {"sentiment": "string"}},
                model="gpt-4o-mini",
            ),
        ],
        steps=[
            PipelineStep(
                name="first_step",
                input="test_input",
                operations=["first_operation"],
            ),
        ],
        output=PipelineOutput(
            type="file", path=temp_output_file, intermediate_dir=temp_intermediate_dir
        ),
        default_model="gpt-4o-mini",
    )
    rerun_cost = new_pipeline_with_only_one_op.run()

    # Zero cost proves the first operation's checkpoint was reused rather
    # than the operation being re-executed.
    assert (
        rerun_cost == 0
    ), "Expected zero cost when rerunning with only the cached first operation"

0 comments on commit ebeb51d

Please sign in to comment.