Tweaking documentation and commenting out value sampling decision in reduce optimizer
shreyashankar committed Sep 15, 2024
1 parent 57e7c67 commit fe23642
Showing 6 changed files with 56 additions and 11 deletions.
docetl/builder.py: 11 additions & 1 deletion
@@ -128,6 +128,7 @@ def __init__(
         self.operations_cost = 0
         self.timeout = timeout
         self.selectivities = defaultdict(dict)
+        self.samples_taken = defaultdict(dict)
         self.resume = resume
 
         home_dir = os.path.expanduser("~")
home_dir = os.path.expanduser("~")
@@ -424,6 +425,13 @@ def _load_optimized_ops(self):
op["name"] for op in self.config["operations"]
]:
# Update the config with the optimized operations
# First, remove all operations that are already in the config with the same name
self.config["operations"] = [
op
for op in self.config["operations"]
if op["name"] != original_op_name
]

for op in optimized_ops:
op["optimize"] = False
self.config["operations"].append(op)
@@ -530,7 +538,7 @@ def optimize(self):
             changed_op = False
             for i, op_config in enumerate(self.optimized_config["operations"]):
                 if op_config["name"] == op:
-                    self.optimized_config["operations"][i] = op
+                    self.optimized_config["operations"][i] = step_operations[op]
                     changed_op = True
             if not changed_op:
                 self.optimized_config["operations"].append(step_operations[op])
@@ -739,6 +747,7 @@ def _optimize_step(
                 selectivity = len(output_data) / len(input_data)
 
                 self.selectivities[step.get("name")][operation_name] = selectivity
+                self.samples_taken[step.get("name")][operation_name] = sample_size
             else:
                 # Use rich console status to indicate optimization of the operation
                 with self.console.status(
@@ -842,6 +851,7 @@ def _optimize_step(
             new_input_data_size = len(input_data)
             selectivity = new_input_data_size / old_input_data_size
             self.selectivities[step.get("name")][op_name] = selectivity
+            self.samples_taken[step.get("name")][op_name] = sample_size
 
             # Set replacement_operations
             replacement_operations[op_object["name"]] = [
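For illustration, here is a toy sketch (hypothetical data, not docetl code) of what the dedup added to `_load_optimized_ops` accomplishes: operations that share the original op's name are dropped before the optimized replacements are appended, so reloading optimized ops cannot leave duplicate entries behind.

```python
# Toy sketch of the dedup-then-append behavior in _load_optimized_ops.
config = {"operations": [{"name": "extract", "optimize": True}]}
original_op_name = "extract"
optimized_ops = [
    {"name": "extract", "optimize": True},
    {"name": "extract_split", "optimize": True},
]

# Remove operations already in the config with the same name...
config["operations"] = [
    op for op in config["operations"] if op["name"] != original_op_name
]
# ...then append the optimized ops, marked so they are not re-optimized.
for op in optimized_ops:
    op["optimize"] = False
    config["operations"].append(op)

print([op["name"] for op in config["operations"]])
# ['extract', 'extract_split'] with no duplicate 'extract' entry
```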
docetl/cli.py: 3 additions & 1 deletion

@@ -19,7 +19,9 @@ def build(
None, help="Maximum number of threads to use for running operations"
),
model: str = typer.Option("gpt-4o", help="Model to use for optimization"),
resume: bool = typer.Option(False, help="Resume optimization from a previous run"),
resume: bool = typer.Option(
False, help="Resume optimization from a previous build that may have failed"
),
timeout: int = typer.Option(
60, help="Timeout for optimization operations in seconds"
),
docetl/operations/resolve.py: 4 additions & 2 deletions

@@ -407,8 +407,10 @@ def process_cluster(cluster):
         # Calculate the number of records before and clusters after
         num_records_before = len(input_data)
         num_clusters_after = len(final_clusters)
-        self.console.log(f"Number of documents before: {num_records_before}")
-        self.console.log(f"Number of distinct documents after: {num_clusters_after}")
+        self.console.log(f"Number of keys before resolution: {num_records_before}")
+        self.console.log(
+            f"Number of distinct keys after resolution: {num_clusters_after}"
+        )
 
         with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
             futures = [
docetl/optimizers/reduce_optimizer.py: 6 additions & 6 deletions

@@ -286,12 +286,12 @@ def _optimize_single_reduce(
             Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing a single-item list with the optimized configuration
                 and a single-item list with the output from the optimized operation, and the cost of the operation due to synthesizing any resolve operations.
         """
-        # Step 1: Determine and configure value sampling
-        value_sampling_config = self._determine_value_sampling(op_config, input_data)
-        if value_sampling_config["enabled"]:
-            op_config["value_sampling"] = value_sampling_config
-            self.console.log("[bold]Value Sampling Configuration:[/bold]")
-            self.console.log(json.dumps(value_sampling_config, indent=2))
+        # Step 1: Determine and configure value sampling (TODO: re-enable this when the agent is more reliable)
+        # value_sampling_config = self._determine_value_sampling(op_config, input_data)
+        # if value_sampling_config["enabled"]:
+        #     op_config["value_sampling"] = value_sampling_config
+        #     self.console.log("[bold]Value Sampling Configuration:[/bold]")
+        #     self.console.log(json.dumps(value_sampling_config, indent=2))
 
         # Step 2: Determine if the reduce operation is associative
         is_associative = self._is_associative(op_config, input_data)
docs/execution/optimizing-pipelines.md: 31 additions & 1 deletion

@@ -4,7 +4,7 @@ After creating your initial map-reduce pipeline, you might want to optimize it f
 
 ## Understanding the Optimizer
 
-The optimizer in docetl finds optimal plans for operations marked with `optimize: True`. It can also insert resolve operations before reduce operations if needed. The optimizer uses GPT-4 under the hood (requiring an OpenAI API key) and can be customized with different models like gpt-4-turbo or gpt-4o-mini. Note that only LLM-powered operations can be optimized (e.g., `map`, `reduce`, `resolve`, `filter`, `equijoin`).
+The optimizer in docetl finds optimal plans for operations marked with `optimize: True`. It can also insert resolve operations before reduce operations if needed. The optimizer uses GPT-4 under the hood (requiring an OpenAI API key) and can be customized with different models like gpt-4-turbo or gpt-4o-mini. Note that only LLM-powered operations can be optimized (e.g., `map`, `reduce`, `resolve`, `filter`, `equijoin`), but the optimized plans may involve new non-LLM operations (e.g., `split`).
 
 At its core, the optimizer employs two types of AI agents: generation agents and validation agents. Generation agents work to rewrite operators into better plans, potentially decomposing a single operation into multiple, more efficient steps. Validation agents then evaluate these candidate plans, synthesizing task-specific validation prompts to compare outputs and determine the best plan for each operator.
 
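For reference, opting an operation into optimization is a one-line change in the pipeline YAML. A minimal sketch (the operation name, prompt, and schema here are hypothetical, not taken from this repository):

```yaml
operations:
  - name: extract_themes # hypothetical map operation
    type: map
    optimize: true # the optimizer will search for a better plan for this op
    prompt: |
      List the main themes in this document: {{ input.text }}
    output:
      schema:
        themes: "list[str]"
```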
@@ -33,6 +33,36 @@ graph LR
 
 The optimization process can be unstable, as well as resource-intensive (we've seen it take up to 10 minutes to optimize a single operation, spending up to ~$50 in API costs for end-to-end pipelines). We recommend optimizing one operation at a time and retrying if necessary, as results may vary between runs. This approach also allows you to confidently verify that each optimized operation is performing as expected before moving on to the next. See the [API](#optimizer-api) for more details on how to resume the optimizer from a failed run, by rerunning `docetl build pipeline.yaml --resume` (with the `--resume` flag).
 
+
+## Should I Use the Optimizer?
+
+While any pipeline can potentially benefit from optimization, there are specific scenarios where using the optimizer can significantly improve your pipeline's performance and accuracy. When should you use the optimizer?
+
+!!! info "Large Documents"
+
+    If you have documents that approach or exceed context limits and a map operation that transforms these documents using an LLM, the optimizer can help:
+
+    - Improve accuracy
+    - Enable processing of entire documents
+    - Optimize for large-scale data handling
+
+!!! info "Entity Resolution"
+    The optimizer is particularly useful when:
+
+    - You need a resolve operation before your reduce operation
+    - You've defined a resolve operation but want to optimize it for speed using blocking
+
+!!! info "High-Volume Reduce Operations"
+    Consider using the optimizer when:
+
+    - You have many documents feeding into a reduce operation for a given key
+    - You're concerned about the accuracy of the reduce operation due to high volume
+    - You want to optimize for better accuracy in complex reductions
+
+
+Even if your pipeline doesn't fall into these specific categories, optimization can still be beneficial. For example, the optimizer can enhance your operations by adding gleaning to an operation, which uses an LLM-powered validator to ensure operation correctness. [Learn more about gleaning](../concepts/operators.md).
+
+
+## Optimization Process
+
+To optimize your pipeline, start with your initial configuration and follow these steps:
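The gleaning mentioned in the new "Should I Use the Optimizer?" section can also be configured by hand. A minimal sketch, assuming a `gleaning` block that takes a round count and a validation prompt as described in the docetl docs (the surrounding operation is hypothetical):

```yaml
  - name: extract_themes # hypothetical map operation
    type: map
    prompt: |
      List the main themes in this document: {{ input.text }}
    output:
      schema:
        themes: "list[str]"
    gleaning:
      num_rounds: 1 # validate-and-refine passes after the initial LLM call
      validation_prompt: |
        Check that every listed theme actually appears in the document.
```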
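Putting the CLI options from `docetl/cli.py` together, a resumable optimization run might look like the following (flag spellings assume Typer's default kebab-case conversion of the parameters shown above):

```bash
# Optimize with an explicit model and a longer per-operation timeout
docetl build pipeline.yaml --model gpt-4o --timeout 120

# If the build fails or is interrupted, resume from the saved progress
docetl build pipeline.yaml --resume
```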
todos.md: 1 addition & 0 deletions

@@ -74,6 +74,7 @@ TODO:
 - [ ] HITL for prompt selection (generally, a textual app)
 - [ ] Fix bug in recursively optimizing reduce in the map optimizer
 - [x] Support reduce key of "all"
+- [ ] Show estimated cost for the full pipeline after optimization
 - [ ] Write tests for optimizers
 - [ ] Refactor reduce and join optimizers
 - [ ] Support prompt prefix/custom instructions so users don't have to put them in every operation
