Tweaking documentation and commenting out value sampling decision in reduce optimizer
shreyashankar committed Sep 15, 2024
1 parent 57e7c67 commit fe23642
Showing 6 changed files with 56 additions and 11 deletions.
docetl/builder.py: 11 additions & 1 deletion
@@ -128,6 +128,7 @@ def __init__(
         self.operations_cost = 0
         self.timeout = timeout
         self.selectivities = defaultdict(dict)
+        self.samples_taken = defaultdict(dict)
         self.resume = resume
 
         home_dir = os.path.expanduser("~")
home_dir = os.path.expanduser("~")
@@ -424,6 +425,13 @@ def _load_optimized_ops(self):
op["name"] for op in self.config["operations"]
]:
# Update the config with the optimized operations
# First, remove all operations that are already in the config with the same name
self.config["operations"] = [
op
for op in self.config["operations"]
if op["name"] != original_op_name
]

for op in optimized_ops:
op["optimize"] = False
self.config["operations"].append(op)
@@ -530,7 +538,7 @@ def optimize(self):
             changed_op = False
             for i, op_config in enumerate(self.optimized_config["operations"]):
                 if op_config["name"] == op:
-                    self.optimized_config["operations"][i] = op
+                    self.optimized_config["operations"][i] = step_operations[op]
                     changed_op = True
             if not changed_op:
                 self.optimized_config["operations"].append(step_operations[op])
@@ -739,6 +747,7 @@ def _optimize_step(
                 selectivity = len(output_data) / len(input_data)
 
                 self.selectivities[step.get("name")][operation_name] = selectivity
+                self.samples_taken[step.get("name")][operation_name] = sample_size
             else:
                 # Use rich console status to indicate optimization of the operation
                 with self.console.status(
@@ -842,6 +851,7 @@ def _optimize_step(
             new_input_data_size = len(input_data)
             selectivity = new_input_data_size / old_input_data_size
             self.selectivities[step.get("name")][op_name] = selectivity
+            self.samples_taken[step.get("name")][op_name] = sample_size
 
             # Set replacement_operations
             replacement_operations[op_object["name"]] = [
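For illustration, here is a toy sketch (hypothetical data, not docetl code) of what the dedup added to `_load_optimized_ops` accomplishes: operations that share the original op's name are dropped before the optimized replacements are appended, so reloading optimized ops cannot leave duplicate entries behind.

```python
# Toy sketch of the dedup-then-append behavior in _load_optimized_ops.
config = {"operations": [{"name": "extract", "optimize": True}]}
original_op_name = "extract"
optimized_ops = [
    {"name": "extract", "optimize": True},
    {"name": "extract_split", "optimize": True},
]

# Remove operations already in the config with the same name...
config["operations"] = [
    op for op in config["operations"] if op["name"] != original_op_name
]
# ...then append the optimized ops, marked so they are not re-optimized.
for op in optimized_ops:
    op["optimize"] = False
    config["operations"].append(op)

print([op["name"] for op in config["operations"]])
# ['extract', 'extract_split'] with no duplicate 'extract' entry
```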
docetl/cli.py: 3 additions & 1 deletion

@@ -19,7 +19,9 @@ def build(
None, help="Maximum number of threads to use for running operations"
),
model: str = typer.Option("gpt-4o", help="Model to use for optimization"),
resume: bool = typer.Option(False, help="Resume optimization from a previous run"),
resume: bool = typer.Option(
False, help="Resume optimization from a previous build that may have failed"
),
timeout: int = typer.Option(
60, help="Timeout for optimization operations in seconds"
),
docetl/operations/resolve.py: 4 additions & 2 deletions

@@ -407,8 +407,10 @@ def process_cluster(cluster):
         # Calculate the number of records before and clusters after
         num_records_before = len(input_data)
         num_clusters_after = len(final_clusters)
-        self.console.log(f"Number of documents before: {num_records_before}")
-        self.console.log(f"Number of distinct documents after: {num_clusters_after}")
+        self.console.log(f"Number of keys before resolution: {num_records_before}")
+        self.console.log(
+            f"Number of distinct keys after resolution: {num_clusters_after}"
+        )
 
         with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
             futures = [
docetl/optimizers/reduce_optimizer.py: 6 additions & 6 deletions

@@ -286,12 +286,12 @@ def _optimize_single_reduce(
             Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing a single-item list with the optimized configuration
                 and a single-item list with the output from the optimized operation, and the cost of the operation due to synthesizing any resolve operations.
         """
-        # Step 1: Determine and configure value sampling
-        value_sampling_config = self._determine_value_sampling(op_config, input_data)
-        if value_sampling_config["enabled"]:
-            op_config["value_sampling"] = value_sampling_config
-            self.console.log("[bold]Value Sampling Configuration:[/bold]")
-            self.console.log(json.dumps(value_sampling_config, indent=2))
+        # Step 1: Determine and configure value sampling (TODO: re-enable this when the agent is more reliable)
+        # value_sampling_config = self._determine_value_sampling(op_config, input_data)
+        # if value_sampling_config["enabled"]:
+        #     op_config["value_sampling"] = value_sampling_config
+        #     self.console.log("[bold]Value Sampling Configuration:[/bold]")
+        #     self.console.log(json.dumps(value_sampling_config, indent=2))
 
         # Step 2: Determine if the reduce operation is associative
         is_associative = self._is_associative(op_config, input_data)
docs/execution/optimizing-pipelines.md: 31 additions & 1 deletion

@@ -4,7 +4,7 @@ After creating your initial map-reduce pipeline, you might want to optimize it f
 
 ## Understanding the Optimizer
 
-The optimizer in docetl finds optimal plans for operations marked with `optimize: True`. It can also insert resolve operations before reduce operations if needed. The optimizer uses GPT-4 under the hood (requiring an OpenAI API key) and can be customized with different models like gpt-4-turbo or gpt-4o-mini. Note that only LLM-powered operations can be optimized (e.g., `map`, `reduce`, `resolve`, `filter`, `equijoin`).
+The optimizer in docetl finds optimal plans for operations marked with `optimize: True`. It can also insert resolve operations before reduce operations if needed. The optimizer uses GPT-4 under the hood (requiring an OpenAI API key) and can be customized with different models like gpt-4-turbo or gpt-4o-mini. Note that only LLM-powered operations can be optimized (e.g., `map`, `reduce`, `resolve`, `filter`, `equijoin`), but the optimized plans may involve new non-LLM operations (e.g., `split`).
 
 At its core, the optimizer employs two types of AI agents: generation agents and validation agents. Generation agents work to rewrite operators into better plans, potentially decomposing a single operation into multiple, more efficient steps. Validation agents then evaluate these candidate plans, synthesizing task-specific validation prompts to compare outputs and determine the best plan for each operator.
 
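For reference, opting an operation into optimization is a one-line change in the pipeline YAML. A minimal sketch (the operation name, prompt, and schema here are hypothetical, not taken from this repository):

```yaml
operations:
  - name: extract_themes # hypothetical map operation
    type: map
    optimize: true # the optimizer will search for a better plan for this op
    prompt: |
      List the main themes in this document: {{ input.text }}
    output:
      schema:
        themes: "list[str]"
```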
@@ -33,6 +33,36 @@ graph LR
 
 The optimization process can be unstable, as well as resource-intensive (we've seen it take up to 10 minutes to optimize a single operation, spending up to ~$50 in API costs for end-to-end pipelines). We recommend optimizing one operation at a time and retrying if necessary, as results may vary between runs. This approach also allows you to confidently verify that each optimized operation is performing as expected before moving on to the next. See the [API](#optimizer-api) for more details on how to resume the optimizer from a failed run, by rerunning `docetl build pipeline.yaml --resume` (with the `--resume` flag).
 
+
+## Should I Use the Optimizer?
+
+While any pipeline can potentially benefit from optimization, there are specific scenarios where using the optimizer can significantly improve your pipeline's performance and accuracy. When should you use the optimizer?
+
+!!! info "Large Documents"
+
+    If you have documents that approach or exceed context limits and a map operation that transforms these documents using an LLM, the optimizer can help:
+
+    - Improve accuracy
+    - Enable processing of entire documents
+    - Optimize for large-scale data handling
+
+!!! info "Entity Resolution"
+    The optimizer is particularly useful when:
+
+    - You need a resolve operation before your reduce operation
+    - You've defined a resolve operation but want to optimize it for speed using blocking
+
+!!! info "High-Volume Reduce Operations"
+    Consider using the optimizer when:
+
+    - You have many documents feeding into a reduce operation for a given key
+    - You're concerned about the accuracy of the reduce operation due to high volume
+    - You want to optimize for better accuracy in complex reductions
+
+
+Even if your pipeline doesn't fall into these specific categories, optimization can still be beneficial. For example, the optimizer can enhance your operations by adding gleaning to an operation, which uses an LLM-powered validator to ensure operation correctness. [Learn more about gleaning](../concepts/operators.md).
+
+
+## Optimization Process
+
+To optimize your pipeline, start with your initial configuration and follow these steps:
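The gleaning mentioned in the new "Should I Use the Optimizer?" section can also be configured by hand. A minimal sketch, assuming a `gleaning` block that takes a round count and a validation prompt as described in the docetl docs (the surrounding operation is hypothetical):

```yaml
  - name: extract_themes # hypothetical map operation
    type: map
    prompt: |
      List the main themes in this document: {{ input.text }}
    output:
      schema:
        themes: "list[str]"
    gleaning:
      num_rounds: 1 # validate-and-refine passes after the initial LLM call
      validation_prompt: |
        Check that every listed theme actually appears in the document.
```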
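Putting the CLI options from `docetl/cli.py` together, a resumable optimization run might look like the following (flag spellings assume Typer's default kebab-case conversion of the parameters shown above):

```bash
# Optimize with an explicit model and a longer per-operation timeout
docetl build pipeline.yaml --model gpt-4o --timeout 120

# If the build fails or is interrupted, resume from the saved progress
docetl build pipeline.yaml --resume
```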
todos.md: 1 addition & 0 deletions

@@ -74,6 +74,7 @@ TODO:
 - [ ] HITL for prompt selection (generally, a textual app)
 - [ ] Fix bug in recursively optimizing reduce in the map optimizer
 - [x] Support reduce key of "all"
+- [ ] Show estimated cost for the full pipeline after optimization
 - [ ] Write tests for optimizers
 - [ ] Refactor reduce and join optimizers
 - [ ] Support prompt prefix/custom instructions so users don't have to put them in every operation
