Commit

HITL for reduce decomposition
shreyashankar committed Sep 15, 2024
1 parent 104a562 commit bedecbc
Showing 5 changed files with 49 additions and 10 deletions.
9 changes: 7 additions & 2 deletions docetl/builder.py
@@ -788,7 +788,7 @@ def _optimize_step(
)
elif op_object.get("type") == "reduce":
optimized_ops = self._optimize_reduce(
- op_object, input_data
+ op_object, input_data, status
)
elif op_object.get("type") == "resolve":
optimized_ops = self._optimize_resolve(
@@ -1079,7 +1079,10 @@ def _get_reduce_sample(
return sample

def _optimize_reduce(
- self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+ self,
+ op_config: Dict[str, Any],
+ input_data: List[Dict[str, Any]],
+ status: Status,
) -> List[Dict[str, Any]]:
"""
Optimize a reduce operation.
@@ -1089,6 +1092,7 @@ def _optimize_reduce(
Args:
op_config (Dict[str, Any]): The configuration of the reduce operation.
input_data (List[Dict[str, Any]]): The input data for the reduce operation.
+ status (Status): The status object to update with the progress of the optimization.
Returns:
List[Dict[str, Any]]: The optimized operation configuration.
@@ -1099,6 +1103,7 @@ def _optimize_reduce(
self.llm_client,
self.max_threads,
self._run_operation,
+ status=status,
)
optimized_ops, _, cost = reduce_optimizer.optimize(op_config, input_data)
self.operations_cost += cost
7 changes: 3 additions & 4 deletions docetl/optimizers/join_optimizer.py
@@ -320,19 +320,18 @@ def synthesize_resolution_prompt(
Example structure:
```
- Analyze the following duplicate entries:
+ Analyze the following duplicate entries for the {reduce_key} key:
{{% for key in inputs %}}
Entry {{{{ loop.index }}}}:
{{{{ key | tojson }}}}
{{% endfor %}}
- Create a single, consolidated key that combines the information from all duplicate entries.
+ Create a single, consolidated key for {reduce_key} that combines the information from all duplicate entries.
When merging, follow these guidelines:
1. [Provide specific merging instructions relevant to the data type]
- 2. [Provide conflict resolution guidelines]
- 3. [Any other relevant instructions]
+ 2. [Do not make the prompt too long]
Ensure that the merged key conforms to the following schema:
{json.dumps(output_schema, indent=2)}
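For readers unfamiliar with the doubled braces above: the prompt text is built inside an f-string, so {{% ... %}} and {{{{ ... }}}} collapse to ordinary Jinja2 delimiters in the synthesized template. A small sketch of how such a template might render, assuming Jinja2 is installed and using a made-up reduce_key of "company" with two toy duplicate entries (the output-schema line is omitted for brevity):

```
from jinja2 import Template

# Template text as it would look after the f-string above is evaluated,
# with a hypothetical reduce_key of "company".
prompt = """Analyze the following duplicate entries for the company key:

{% for key in inputs %}
Entry {{ loop.index }}:
{{ key | tojson }}
{% endfor %}

Create a single, consolidated key for company that combines the information from all duplicate entries.
When merging, follow these guidelines:
1. [Provide specific merging instructions relevant to the data type]
2. [Do not make the prompt too long]
"""

# Toy duplicate entries standing in for the resolve operation's inputs.
inputs = [
    {"company": "Acme Corp", "city": "Berlin"},
    {"company": "ACME Corporation", "city": "Berlin"},
]

print(Template(prompt).render(inputs=inputs))
```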
4 changes: 4 additions & 0 deletions docetl/optimizers/map_optimizer/plan_generators.py
@@ -323,6 +323,10 @@ def task():
return plan_name, plan

# Optimize the reduce op for this current plan
+ # TODO: enable this by default. it's just that
+ # reduce drilldown decomposition is unreliable via the
+ # agent, and I don't want users to have to confirm/interact
+ # on every synthesized reduce op
try:
optimized_reduce_ops, _, cost = ReduceOptimizer(
self.config,
37 changes: 34 additions & 3 deletions docetl/optimizers/reduce_optimizer.py
@@ -4,9 +4,11 @@
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from statistics import mean, median
- from typing import Any, Callable, Dict, List, Tuple, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from rich.console import Console
+ from rich.prompt import Confirm
+ from rich.status import Status

from docetl.operations.base import BaseOperation
from docetl.optimizers.join_optimizer import JoinOptimizer
@@ -43,6 +45,7 @@ def __init__(
run_operation: Callable,
num_fold_prompts: int = 1,
num_samples_in_validation: int = 10,
+ status: Optional[Status] = None,
):
"""
Initialize the ReduceOptimizer.
@@ -63,6 +66,7 @@ def __init__(
self.max_threads = max_threads
self.num_fold_prompts = num_fold_prompts
self.num_samples_in_validation = num_samples_in_validation
+ self.status = status

def optimize(
self,
@@ -486,6 +490,33 @@ def _evaluate_decomposition(
if not should_decompose["should_decompose"]:
return should_decompose

+ # Temporarily stop the status
+ if self.status:
+     self.status.stop()
+
+ # Ask user if they agree with the decomposition assessment
+ user_agrees = Confirm.ask(
+     f"Do you agree with the decomposition assessment? "
+     f"[bold]{'Recommended' if should_decompose['should_decompose'] else 'Not recommended'}[/bold]"
+ )
+
+ # If user disagrees, invert the decomposition decision
+ if not user_agrees:
+     should_decompose["should_decompose"] = not should_decompose[
+         "should_decompose"
+     ]
+     should_decompose["explanation"] = (
+         "User disagreed with the initial assessment."
+     )
+
+ # Restart the status
+ if self.status:
+     self.status.start()
+
+ # Return if decomposition is not recommended
+ if not should_decompose["should_decompose"]:
+     return should_decompose

decomposition_details = self._get_decomposition_details(op_config, input_data)
result = {**should_decompose, **decomposition_details}
if decomposition_details["sub_group_key"] in op_config["reduce_key"]:
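The added block stops the spinner before prompting because rich drives a Status display from a background refresh thread; leaving it live while Confirm.ask reads from stdin would interleave the question and the typed answer with spinner redraws. A self-contained sketch of the same stop, confirm, optionally invert, then restart pattern, using a made-up assessment dict in place of the agent's output:

```
import time

from rich.console import Console
from rich.prompt import Confirm

console = Console()

# Stand-in for the agent's assessment; in DocETL this comes from the LLM.
should_decompose = {"should_decompose": True, "explanation": "Sub-groups detected."}

with console.status("[cyan]Optimizing reduce operation...[/cyan]") as status:
    time.sleep(1)  # stand-in for the agent call that produced the assessment

    status.stop()  # pause the live display so the prompt and keyboard input render cleanly
    user_agrees = Confirm.ask(
        "Do you agree with the decomposition assessment? "
        f"[bold]{'Recommended' if should_decompose['should_decompose'] else 'Not recommended'}[/bold]"
    )
    if not user_agrees:
        # Override the agent: flip the decision and record why.
        should_decompose["should_decompose"] = not should_decompose["should_decompose"]
        should_decompose["explanation"] = "User disagreed with the initial assessment."
    status.start()  # resume the spinner for the rest of the optimization

    time.sleep(1)  # stand-in for the remaining optimization work

console.log(should_decompose)
```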
@@ -1204,7 +1235,7 @@ def _create_reduce_plans(
# Generate 6 candidate batch sizes
batch_sizes = [
max(1, int(max_batch_size * ratio))
- for ratio in [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
+ for ratio in [0.1, 0.2, 0.4, 0.6, 0.75, 0.9]
]
# Log the generated batch sizes
self.console.log("[cyan]Generating plans for batch sizes:[/cyan]")
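The new ratios drop the 0.8 and 1.0 candidates in favor of 0.75 and 0.9, so the largest generated plan now stays below the computed maximum batch size rather than using it outright. A quick worked example, with a hypothetical max_batch_size of 50:

```
max_batch_size = 50  # hypothetical value, for illustration only

batch_sizes = [
    max(1, int(max_batch_size * ratio))
    for ratio in [0.1, 0.2, 0.4, 0.6, 0.75, 0.9]
]
print(batch_sizes)  # [5, 10, 20, 30, 37, 45]
```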
@@ -1225,7 +1256,7 @@ def _create_reduce_plans(
op_config,
sample_input,
sample_output,
- num_prompts=2,
+ num_prompts=self.num_fold_prompts,
)
if not fold_prompts:
raise ValueError("No fold prompts generated")
2 changes: 1 addition & 1 deletion todos.md
@@ -71,7 +71,7 @@ TODO:
- [x] Support unnests in the optimizer
- [x] Print out the name of the plan we are synthesizing
- [x] Add gleaning plan to reduce
- [ ] Reduce optimizer should get a human to confirm if a drill-down roll-up decomposition makes sense
- [x] Reduce optimizer should get a human to confirm if a drill-down roll-up decomposition makes sense
- [ ] Allow gleaning model to be different from the main op model
- [ ] HITL for prompt selection (generally, a textual app)
- [ ] Fix bug in recursively optimizing reduce in the map optimizer
