Commit

HITL for reduce decomposition
shreyashankar committed Sep 15, 2024
1 parent 104a562 commit bedecbc
Showing 5 changed files with 49 additions and 10 deletions.
9 changes: 7 additions & 2 deletions docetl/builder.py
@@ -788,7 +788,7 @@ def _optimize_step(
)
elif op_object.get("type") == "reduce":
optimized_ops = self._optimize_reduce(
- op_object, input_data
+ op_object, input_data, status
)
elif op_object.get("type") == "resolve":
optimized_ops = self._optimize_resolve(
@@ -1079,7 +1079,10 @@ def _get_reduce_sample(
return sample

def _optimize_reduce(
- self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+ self,
+ op_config: Dict[str, Any],
+ input_data: List[Dict[str, Any]],
+ status: Status,
) -> List[Dict[str, Any]]:
"""
Optimize a reduce operation.
@@ -1089,6 +1092,7 @@ def _optimize_reduce(
Args:
op_config (Dict[str, Any]): The configuration of the reduce operation.
input_data (List[Dict[str, Any]]): The input data for the reduce operation.
+ status (Status): The status object to update with the progress of the optimization.
Returns:
List[Dict[str, Any]]: The optimized operation configuration.
@@ -1099,6 +1103,7 @@ def _optimize_reduce(
self.llm_client,
self.max_threads,
self._run_operation,
+ status=status,
)
optimized_ops, _, cost = reduce_optimizer.optimize(op_config, input_data)
self.operations_cost += cost
7 changes: 3 additions & 4 deletions docetl/optimizers/join_optimizer.py
@@ -320,19 +320,18 @@ def synthesize_resolution_prompt(
Example structure:
```
- Analyze the following duplicate entries:
+ Analyze the following duplicate entries for the {reduce_key} key:
{{% for key in inputs %}}
Entry {{{{ loop.index }}}}:
{{{{ key | tojson }}}}
{{% endfor %}}
- Create a single, consolidated key that combines the information from all duplicate entries.
+ Create a single, consolidated key for {reduce_key} that combines the information from all duplicate entries.
When merging, follow these guidelines:
1. [Provide specific merging instructions relevant to the data type]
- 2. [Provide conflict resolution guidelines]
- 3. [Any other relevant instructions]
+ 2. [Do not make the prompt too long]
Ensure that the merged key conforms to the following schema:
{json.dumps(output_schema, indent=2)}
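For readers unfamiliar with the doubled braces above: the prompt text is built inside an f-string, so {{% ... %}} and {{{{ ... }}}} collapse to ordinary Jinja2 delimiters in the synthesized template. A small sketch of how such a template might render, assuming Jinja2 is installed and using a made-up reduce_key of "company" with two toy duplicate entries (the output-schema line is omitted for brevity):

```
from jinja2 import Template

# Template text as it would look after the f-string above is evaluated,
# with a hypothetical reduce_key of "company".
prompt = """Analyze the following duplicate entries for the company key:

{% for key in inputs %}
Entry {{ loop.index }}:
{{ key | tojson }}
{% endfor %}

Create a single, consolidated key for company that combines the information from all duplicate entries.
When merging, follow these guidelines:
1. [Provide specific merging instructions relevant to the data type]
2. [Do not make the prompt too long]
"""

# Toy duplicate entries standing in for the resolve operation's inputs.
inputs = [
    {"company": "Acme Corp", "city": "Berlin"},
    {"company": "ACME Corporation", "city": "Berlin"},
]

print(Template(prompt).render(inputs=inputs))
```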
4 changes: 4 additions & 0 deletions docetl/optimizers/map_optimizer/plan_generators.py
@@ -323,6 +323,10 @@ def task():
return plan_name, plan

# Optimize the reduce op for this current plan
+ # TODO: enable this by default. it's just that
+ # reduce drilldown decomposition is unreliable via the
+ # agent, and I don't want users to have to confirm/interact
+ # on every synthesized reduce op
try:
optimized_reduce_ops, _, cost = ReduceOptimizer(
self.config,
37 changes: 34 additions & 3 deletions docetl/optimizers/reduce_optimizer.py
@@ -4,9 +4,11 @@
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from statistics import mean, median
- from typing import Any, Callable, Dict, List, Tuple, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from rich.console import Console
+ from rich.prompt import Confirm
+ from rich.status import Status

from docetl.operations.base import BaseOperation
from docetl.optimizers.join_optimizer import JoinOptimizer
@@ -43,6 +45,7 @@ def __init__(
run_operation: Callable,
num_fold_prompts: int = 1,
num_samples_in_validation: int = 10,
+ status: Optional[Status] = None,
):
"""
Initialize the ReduceOptimizer.
@@ -63,6 +66,7 @@ def __init__(
self.max_threads = max_threads
self.num_fold_prompts = num_fold_prompts
self.num_samples_in_validation = num_samples_in_validation
+ self.status = status

def optimize(
self,
@@ -486,6 +490,33 @@ def _evaluate_decomposition(
if not should_decompose["should_decompose"]:
return should_decompose

+ # Temporarily stop the status
+ if self.status:
+     self.status.stop()
+
+ # Ask user if they agree with the decomposition assessment
+ user_agrees = Confirm.ask(
+     f"Do you agree with the decomposition assessment? "
+     f"[bold]{'Recommended' if should_decompose['should_decompose'] else 'Not recommended'}[/bold]"
+ )
+
+ # If user disagrees, invert the decomposition decision
+ if not user_agrees:
+     should_decompose["should_decompose"] = not should_decompose[
+         "should_decompose"
+     ]
+     should_decompose["explanation"] = (
+         "User disagreed with the initial assessment."
+     )
+
+ # Restart the status
+ if self.status:
+     self.status.start()
+
+ # Return if decomposition is not recommended
+ if not should_decompose["should_decompose"]:
+     return should_decompose

decomposition_details = self._get_decomposition_details(op_config, input_data)
result = {**should_decompose, **decomposition_details}
if decomposition_details["sub_group_key"] in op_config["reduce_key"]:
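The added block stops the spinner before prompting because rich drives a Status display from a background refresh thread; leaving it live while Confirm.ask reads from stdin would interleave the question and the typed answer with spinner redraws. A self-contained sketch of the same stop, confirm, optionally invert, then restart pattern, using a made-up assessment dict in place of the agent's output:

```
import time

from rich.console import Console
from rich.prompt import Confirm

console = Console()

# Stand-in for the agent's assessment; in DocETL this comes from the LLM.
should_decompose = {"should_decompose": True, "explanation": "Sub-groups detected."}

with console.status("[cyan]Optimizing reduce operation...[/cyan]") as status:
    time.sleep(1)  # stand-in for the agent call that produced the assessment

    status.stop()  # pause the live display so the prompt and keyboard input render cleanly
    user_agrees = Confirm.ask(
        "Do you agree with the decomposition assessment? "
        f"[bold]{'Recommended' if should_decompose['should_decompose'] else 'Not recommended'}[/bold]"
    )
    if not user_agrees:
        # Override the agent: flip the decision and record why.
        should_decompose["should_decompose"] = not should_decompose["should_decompose"]
        should_decompose["explanation"] = "User disagreed with the initial assessment."
    status.start()  # resume the spinner for the rest of the optimization

    time.sleep(1)  # stand-in for the remaining optimization work

console.log(should_decompose)
```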
@@ -1204,7 +1235,7 @@ def _create_reduce_plans(
# Generate 6 candidate batch sizes
batch_sizes = [
max(1, int(max_batch_size * ratio))
- for ratio in [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
+ for ratio in [0.1, 0.2, 0.4, 0.6, 0.75, 0.9]
]
# Log the generated batch sizes
self.console.log("[cyan]Generating plans for batch sizes:[/cyan]")
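The new ratios drop the 0.8 and 1.0 candidates in favor of 0.75 and 0.9, so the largest generated plan now stays below the computed maximum batch size rather than using it outright. A quick worked example, with a hypothetical max_batch_size of 50:

```
max_batch_size = 50  # hypothetical value, for illustration only

batch_sizes = [
    max(1, int(max_batch_size * ratio))
    for ratio in [0.1, 0.2, 0.4, 0.6, 0.75, 0.9]
]
print(batch_sizes)  # [5, 10, 20, 30, 37, 45]
```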
@@ -1225,7 +1256,7 @@ def _create_reduce_plans(
op_config,
sample_input,
sample_output,
- num_prompts=2,
+ num_prompts=self.num_fold_prompts,
)
if not fold_prompts:
raise ValueError("No fold prompts generated")
2 changes: 1 addition & 1 deletion todos.md
@@ -71,7 +71,7 @@ TODO:
- [x] Support unnests in the optimizer
- [x] Print out the name of the plan we are synthesizing
- [x] Add gleaning plan to reduce
- [ ] Reduce optimizer should get a human to confirm if a drill-down roll-up decomposition makes sense
- [x] Reduce optimizer should get a human to confirm if a drill-down roll-up decomposition makes sense
- [ ] Allow gleaning model to be different from the main op model
- [ ] HITL for prompt selection (generally, a textual app)
- [ ] Fix bug in recursively optimizing reduce in the map optimizer
