refactor recursive optimization for map operations (#225)
* feat: add column view dialog

* feat: add column view dialog

* increase docker time

* feat: add feedback indicator in the row view

* feat: add prompt engineering flow for notes

* feat: add prompt engineering flow for notes

* feat: support resolve prompts in the prompt editor

* fix: fix build errors

* docs: update playground

* docs: update playground

* docs: update playground

* docs: update playground

* Update reduce folding instruction

* fix: make histogram calculation and rendering less blocking

* tests: make docker CI more robust

* docs: edit readme to be formatted better

* Edit system prompt instruction

* Edit system prompt instruction

* chore: refactor recursive optimization for map ops

* feat: small performance optimization to prompt improvement
shreyashankar authored Dec 4, 2024
1 parent 8139461 commit bd40799
Showing 8 changed files with 256 additions and 170 deletions.
8 changes: 5 additions & 3 deletions docetl/operations/utils.py
@@ -779,7 +779,7 @@ def _call_llm_with_cache(
         parameters["required"] = list(props.keys())
 
         # TODO: this is a hack to get around the fact that gemini doesn't support additionalProperties
-        if "gemini" not in model:
+        if "gemini" not in model and "claude" not in model:
             parameters["additionalProperties"] = False
 
         tools = [
@@ -788,12 +788,14 @@
                 "function": {
                     "name": "send_output",
                     "description": "Send output back to the user",
-                    "strict": True,
                     "parameters": parameters,
-                    "additionalProperties": False,
                 },
             }
         ]
+        if "claude" not in model:
+            tools[0]["additionalProperties"] = False
+            tools[0]["strict"] = True
+
         tool_choice = {"type": "function", "function": {"name": "send_output"}}
 
     elif tools is not None:
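For context, the model-specific handling in the two hunks above can be summarized as a standalone helper. This is a hedged sketch, not a function that exists in docetl: the name build_send_output_tool and its signature are illustrative, and the provider quirks are taken only from the TODO comment and the conditions in the diff.

from typing import Any, Dict

def build_send_output_tool(model: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
    """Illustrative sketch of the tool-building logic after this commit."""
    # Per the TODO above, Gemini rejects "additionalProperties" inside the JSON
    # schema; this commit extends the same exclusion to Claude models.
    if "gemini" not in model and "claude" not in model:
        parameters["additionalProperties"] = False

    tool = {
        "type": "function",
        "function": {
            "name": "send_output",
            "description": "Send output back to the user",
            "parameters": parameters,
        },
    }
    # "strict" and "additionalProperties" move out of the static dict and are
    # attached only when the model is not Claude, mirroring the second hunk.
    if "claude" not in model:
        tool["additionalProperties"] = False
        tool["strict"] = True
    return tool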
28 changes: 15 additions & 13 deletions docetl/optimizers/map_optimizer/optimizer.py
@@ -3,7 +3,7 @@
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 from jinja2 import Template
 from litellm import model_cost
@@ -47,6 +47,7 @@ def __init__(
         run_operation: Callable,
         timeout: int = 10,
         is_filter: bool = False,
+        depth: int = 1,
     ):
         """
         Initialize the MapOptimizer.
@@ -72,7 +73,7 @@ def __init__(
         self.k_to_pairwise_compare = 6
 
         self.plan_generator = PlanGenerator(
-            runner, llm_client, console, config, run_operation, max_threads, is_filter
+            runner, llm_client, console, config, run_operation, max_threads, is_filter, depth
         )
         self.evaluator = Evaluator(
             llm_client,
@@ -206,7 +207,7 @@ def _should_optimize_helper(self, op_config: Dict[str, Any], input_data: List[Di
 
 
     def optimize(
-        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]], plan_types: Optional[List[str]] = ["chunk", "proj_synthesis", "glean"]
     ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
         """
         Optimize the given operation configuration for the input data.
@@ -260,7 +261,7 @@ def optimize(
             self.console.log(
                 f"[green]No improvement needed for operation {op_config['name']}[/green]"
             )
-            return [op_config], output_data, self.plan_generator.reduce_optimizer_cost
+            return [op_config], output_data, self.plan_generator.subplan_optimizer_cost
 
         candidate_plans = {}
 
@@ -274,15 +275,16 @@
 
         # Generate chunk size plans
         self.console.post_optimizer_status(StageType.CANDIDATE_PLANS)
-        self.console.log("[bold magenta]Generating chunking plans...[/bold magenta]")
-        chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
-            op_config, input_data, validator_prompt, model_input_context_length
-        )
-        for pname, plan in chunk_size_plans.items():
-            candidate_plans[pname] = plan
+        if "chunk" in plan_types:
+            self.console.log("[bold magenta]Generating chunking plans...[/bold magenta]")
+            chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
+                op_config, input_data, validator_prompt, model_input_context_length
+            )
+            for pname, plan in chunk_size_plans.items():
+                candidate_plans[pname] = plan
 
         # Generate gleaning plans
-        if not data_exceeds_limit:
+        if not data_exceeds_limit and "glean" in plan_types:
             self.console.log(
                 "[bold magenta]Generating gleaning plans...[/bold magenta]"
             )
@@ -293,7 +295,7 @@ def optimize(
                 candidate_plans[pname] = plan
 
         # Generate chain decomposition plans
-        if not data_exceeds_limit:
+        if not data_exceeds_limit and "proj_synthesis" in plan_types:
             if not self.is_filter:
                 self.console.log(
                     "[bold magenta]Generating chain projection synthesis plans...[/bold magenta]"
@@ -465,5 +467,5 @@ def optimize(
         return (
             candidate_plans[best_plan_name],
             best_output,
-            self.plan_generator.reduce_optimizer_cost,
+            self.plan_generator.subplan_optimizer_cost,
         )
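Taken together, the optimizer changes add a plan_types filter to optimize() and thread a depth argument through to PlanGenerator, presumably to bound recursive optimization of synthesized sub-plans (consistent with the rename to subplan_optimizer_cost). A hypothetical call sketch follows; map_optimizer, op_config, and input_data are placeholders, and only the optimize() signature is taken from the diff above.

# Hypothetical usage (placeholder variables). Restricting plan_types to "glean"
# skips the chunking and projection-synthesis plan families entirely.
optimized_ops, best_output, cost = map_optimizer.optimize(
    op_config,              # config dict for the map operation being tuned
    input_data,             # sample documents used to evaluate candidate plans
    plan_types=["glean"],   # default is ["chunk", "proj_synthesis", "glean"]
)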