refactor recursive optimization for map operations (#225)
* feat: add column view dialog

* feat: add column view dialog

* increase docker time

* feat: add feedback indicator in the row view

* feat: add prompt engineering flow for notes

* feat: add prompt engineering flow for notes

* feat: support resolve prompts in the prompt editor

* fix: fix build errors

* docs: update playground

* docs: update playground

* docs: update playground

* docs: update playground

* Update reduce folding instruction

* fix: make histogram calculation and rendering less blocking

* tests: make docker CI more robust

* docs: edit readme to be formatted better

* Edit system prompt instruction

* Edit system prompt instruction

* chore: refactor recursive optimization for map ops

* feat: small performance optimization to prompt improvement
shreyashankar authored Dec 4, 2024
1 parent 8139461 commit bd40799
Showing 8 changed files with 256 additions and 170 deletions.
8 changes: 5 additions & 3 deletions docetl/operations/utils.py
@@ -779,7 +779,7 @@ def _call_llm_with_cache(
         parameters["required"] = list(props.keys())
 
         # TODO: this is a hack to get around the fact that gemini doesn't support additionalProperties
-        if "gemini" not in model:
+        if "gemini" not in model and "claude" not in model:
             parameters["additionalProperties"] = False
 
         tools = [
@@ -788,12 +788,14 @@
                 "function": {
                     "name": "send_output",
                     "description": "Send output back to the user",
-                    "strict": True,
                     "parameters": parameters,
-                    "additionalProperties": False,
                 },
             }
         ]
+        if "claude" not in model:
+            tools[0]["additionalProperties"] = False
+            tools[0]["strict"] = True
+
         tool_choice = {"type": "function", "function": {"name": "send_output"}}
 
     elif tools is not None:
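For context, the model-specific handling in the two hunks above can be summarized as a standalone helper. This is a hedged sketch, not a function that exists in docetl: the name build_send_output_tool and its signature are illustrative, and the provider quirks are taken only from the TODO comment and the conditions in the diff.

from typing import Any, Dict

def build_send_output_tool(model: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
    """Illustrative sketch of the tool-building logic after this commit."""
    # Per the TODO above, Gemini rejects "additionalProperties" inside the JSON
    # schema; this commit extends the same exclusion to Claude models.
    if "gemini" not in model and "claude" not in model:
        parameters["additionalProperties"] = False

    tool = {
        "type": "function",
        "function": {
            "name": "send_output",
            "description": "Send output back to the user",
            "parameters": parameters,
        },
    }
    # "strict" and "additionalProperties" move out of the static dict and are
    # attached only when the model is not Claude, mirroring the second hunk.
    if "claude" not in model:
        tool["additionalProperties"] = False
        tool["strict"] = True
    return tool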
28 changes: 15 additions & 13 deletions docetl/optimizers/map_optimizer/optimizer.py
@@ -3,7 +3,7 @@
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 from jinja2 import Template
 from litellm import model_cost
@@ -47,6 +47,7 @@ def __init__(
         run_operation: Callable,
         timeout: int = 10,
         is_filter: bool = False,
+        depth: int = 1,
     ):
         """
         Initialize the MapOptimizer.
@@ -72,7 +73,7 @@ def __init__(
         self.k_to_pairwise_compare = 6
 
         self.plan_generator = PlanGenerator(
-            runner, llm_client, console, config, run_operation, max_threads, is_filter
+            runner, llm_client, console, config, run_operation, max_threads, is_filter, depth
         )
         self.evaluator = Evaluator(
             llm_client,
@@ -206,7 +207,7 @@ def _should_optimize_helper(self, op_config: Dict[str, Any], input_data: List[Di
 
 
     def optimize(
-        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]], plan_types: Optional[List[str]] = ["chunk", "proj_synthesis", "glean"]
     ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
         """
         Optimize the given operation configuration for the input data.
@@ -260,7 +261,7 @@ def optimize(
             self.console.log(
                 f"[green]No improvement needed for operation {op_config['name']}[/green]"
             )
-            return [op_config], output_data, self.plan_generator.reduce_optimizer_cost
+            return [op_config], output_data, self.plan_generator.subplan_optimizer_cost
 
         candidate_plans = {}
 
@@ -274,15 +275,16 @@
 
         # Generate chunk size plans
         self.console.post_optimizer_status(StageType.CANDIDATE_PLANS)
-        self.console.log("[bold magenta]Generating chunking plans...[/bold magenta]")
-        chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
-            op_config, input_data, validator_prompt, model_input_context_length
-        )
-        for pname, plan in chunk_size_plans.items():
-            candidate_plans[pname] = plan
+        if "chunk" in plan_types:
+            self.console.log("[bold magenta]Generating chunking plans...[/bold magenta]")
+            chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
+                op_config, input_data, validator_prompt, model_input_context_length
+            )
+            for pname, plan in chunk_size_plans.items():
+                candidate_plans[pname] = plan
 
         # Generate gleaning plans
-        if not data_exceeds_limit:
+        if not data_exceeds_limit and "glean" in plan_types:
             self.console.log(
                 "[bold magenta]Generating gleaning plans...[/bold magenta]"
             )
@@ -293,7 +295,7 @@ def optimize(
                 candidate_plans[pname] = plan
 
         # Generate chain decomposition plans
-        if not data_exceeds_limit:
+        if not data_exceeds_limit and "proj_synthesis" in plan_types:
             if not self.is_filter:
                 self.console.log(
                     "[bold magenta]Generating chain projection synthesis plans...[/bold magenta]"
@@ -465,5 +467,5 @@ def optimize(
         return (
             candidate_plans[best_plan_name],
             best_output,
-            self.plan_generator.reduce_optimizer_cost,
+            self.plan_generator.subplan_optimizer_cost,
         )
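Taken together, the optimizer changes add a plan_types filter to optimize() and thread a depth argument through to PlanGenerator, presumably to bound recursive optimization of synthesized sub-plans (consistent with the rename to subplan_optimizer_cost). A hypothetical call sketch follows; map_optimizer, op_config, and input_data are placeholders, and only the optimize() signature is taken from the diff above.

# Hypothetical usage (placeholder variables). Restricting plan_types to "glean"
# skips the chunking and projection-synthesis plan families entirely.
optimized_ops, best_output, cost = map_optimizer.optimize(
    op_config,              # config dict for the map operation being tuned
    input_data,             # sample documents used to evaluate candidate plans
    plan_types=["glean"],   # default is ["chunk", "proj_synthesis", "glean"]
)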