Skip to content

Commit

Permalink
Merge pull request #186 from ucbepic/codeopsui
Browse files Browse the repository at this point in the history
chore: make output visualizations better
  • Loading branch information
shreyashankar authored Nov 15, 2024
2 parents 5b9a2a4 + 71e5e26 commit e922488
Show file tree
Hide file tree
Showing 12 changed files with 473 additions and 72 deletions.
9 changes: 5 additions & 4 deletions docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,7 @@ def _load_optimized_ops(self):
else:
self.console.log("[yellow]No optimized operations found[/yellow]")

def should_optimize(self, step_name: str, op_name: str) -> bool:
def should_optimize(self, step_name: str, op_name: str) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]], float]:
"""
Determine if an operation should be optimized.
We do this by running the operations on a sample of the input data and checking if the output is correct.
Expand Down Expand Up @@ -509,6 +509,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool:
input_data = self._run_partial_step(
step, ops_run, sample_size, op_name_to_object
)
output_data = input_data

# If this is not the operation we want to optimize, just execute it and add to selectivities
if f"{step.get('name')}/{op_name}" != f"{step_name}/{op_name}" and op_object.get("empty", False):
Expand All @@ -530,7 +531,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool:
timeout=self.timeout,
is_filter=op_object.get("type") == "filter",
)
should_optimize_output = map_optimizer.should_optimize(op_object, input_data)
should_optimize_output, input_data, output_data = map_optimizer.should_optimize(op_object, input_data)
elif op_object.get("type") == "reduce":
reduce_optimizer = ReduceOptimizer(
self.runner,
Expand All @@ -540,7 +541,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool:
self.max_threads,
self._run_operation,
)
should_optimize_output = reduce_optimizer.should_optimize(op_object, input_data)
should_optimize_output, input_data, output_data = reduce_optimizer.should_optimize(op_object, input_data)
elif op_object.get("type") == "resolve":
resolve_optimizer = JoinOptimizer(
self.runner,
Expand All @@ -560,7 +561,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool:
continue

# Return the string and operation cost
return should_optimize_output, self.operations_cost + self.llm_client.total_cost
return should_optimize_output, input_data, output_data, self.operations_cost + self.llm_client.total_cost

# Should not get here
raise ValueError("No operation to optimize found")
Expand Down
6 changes: 1 addition & 5 deletions docetl/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,11 +344,7 @@ def sample(self, n: int, random: bool = True) -> List[Dict]:
)
sampled_data = rd.sample(data, n)
else:
sampled_data = []
for i, line in enumerate(f):
if i >= n:
break
sampled_data.append(json.loads(line))
return json.load(f)[:n]

elif ext == ".csv":
import csv
Expand Down
6 changes: 3 additions & 3 deletions docetl/optimizers/map_optimizer/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def __init__(
runner, llm_client, console, config, max_threads, is_filter
)

def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> str:
def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
"""
Determine if the given operation configuration should be optimized.
"""
Expand All @@ -95,9 +95,9 @@ def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str,
assessment_str = "\n".join(assessment.get("reasons", [])) + "\n\nHere are some improvements that may help:\n" + "\n".join(assessment.get("improvements", []))
if data_exceeds_limit:
assessment_str += "\nAlso, the input data exceeds the token limit."
return assessment_str
return assessment_str, input_data, output_data
else:
return ""
return "", input_data, output_data


def _should_optimize_helper(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], int, float, str, Dict[str, Any], bool]:
Expand Down
8 changes: 4 additions & 4 deletions docetl/optimizers/reduce_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,20 +124,20 @@ def should_optimize_helper(

return validation_results, prompt_tokens, model_input_context_length, model, validator_prompt, original_output

def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Assess whether this reduce operation should be decomposed/optimized.

    Runs the operation on ``input_data`` via ``should_optimize_helper`` and
    returns an assessment alongside the data samples so callers can surface
    both the reasoning and the concrete inputs/outputs.

    Args:
        op_config: Configuration dictionary of the reduce operation.
        input_data: Sample documents to execute the operation on.

    Returns:
        Tuple of (assessment string — empty when no optimization is
        recommended, the input sample, the operation's original output).
    """
    (
        validation_results,
        prompt_tokens,
        model_input_context_length,
        model,
        validator_prompt,
        original_output,
    ) = self.should_optimize_helper(op_config, input_data)

    # Heuristic: leave headroom — if the prompt is within 1.5x of the
    # model's context window, larger real inputs are likely to overflow.
    if prompt_tokens * 1.5 > model_input_context_length:
        # Fix: original used a plain string literal, so "{model}" was
        # emitted verbatim and never interpolated; must be an f-string.
        return (
            f"The reduce prompt is likely to exceed the token limit for model {model}.",
            input_data,
            original_output,
        )

    if validation_results.get("needs_improvement", False):
        return (
            "\n".join(
                f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
                for result in validation_results["validation_results"]
            ),
            input_data,
            original_output,
        )
    else:
        return "", input_data, original_output

def optimize(
self,
Expand Down
4 changes: 2 additions & 2 deletions docetl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ def execute_step(

# If sample is set, sample the input data
if op_object.get("sample"):
input_data = self.datasets[step["input"]].sample(op_object["sample"])
input_data = self.datasets[step["input"]].sample(op_object["sample"], False)

with self.console.status("[bold]Running Operation:[/bold]") as status:
status.update(f"Type: [cyan]{op_object['type']}[/cyan]")
Expand Down Expand Up @@ -478,7 +478,7 @@ def _save_checkpoint(self, step_name: str, operation_name: str, data: List[Dict]
f"[green]✓ [italic]Intermediate saved for operation '{operation_name}' in step '{step_name}' at {checkpoint_path}[/italic][/green]"
)

def should_optimize(self, step_name: str, op_name: str, **kwargs) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]], float]:
    """Check whether the named operation would benefit from optimization.

    Thin delegate to ``Optimizer.should_optimize``.

    Args:
        step_name: Name of the pipeline step containing the operation.
        op_name: Name of the operation to assess.
        **kwargs: Forwarded to the ``Optimizer`` constructor.

    Returns:
        Tuple of (assessment string, input sample, output sample, cost).
        Note: the delegated call returns cost LAST — the original
        annotation ``Tuple[str, float, ...]`` misstated the element order.
    """
    builder = Optimizer(self, **kwargs)
    return builder.should_optimize(step_name, op_name)

Expand Down
8 changes: 6 additions & 2 deletions server/app/routes/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional
import uuid
from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
from server.app.models import PipelineRequest
Expand Down Expand Up @@ -30,6 +30,8 @@ class OptimizeResult(BaseModel):
task_id: str
status: TaskStatus
should_optimize: Optional[str] = None
input_data: Optional[List[Dict[str, Any]]] = None
output_data: Optional[List[Dict[str, Any]]] = None
cost: Optional[float] = None
error: Optional[str] = None
created_at: datetime
Expand Down Expand Up @@ -76,7 +78,7 @@ async def run_optimization(task_id: str, yaml_config: str, step_name: str, op_na

# Run the actual optimization in a separate thread to not block
runner = DSLRunner.from_yaml(yaml_config)
should_optimize, cost = await asyncio.to_thread(
should_optimize, input_data, output_data, cost = await asyncio.to_thread(
runner.should_optimize,
step_name,
op_name
Expand All @@ -85,6 +87,8 @@ async def run_optimization(task_id: str, yaml_config: str, step_name: str, op_na
# Update task result
tasks[task_id].status = TaskStatus.COMPLETED
tasks[task_id].should_optimize = should_optimize
tasks[task_id].input_data = input_data
tasks[task_id].output_data = output_data
tasks[task_id].cost = cost
tasks[task_id].completed_at = datetime.now()

Expand Down
2 changes: 2 additions & 0 deletions website/src/app/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ export interface OptimizeResult {
task_id: string;
status: TaskStatus;
should_optimize?: string;
input_data?: Array<Record<string, unknown>>;
output_data?: Array<Record<string, unknown>>;
cost?: number;
error?: string;
created_at: string;
Expand Down
75 changes: 37 additions & 38 deletions website/src/components/BookmarkableText.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ const formSchema = z.object({
const BookmarkableText: React.FC<BookmarkableTextProps> = ({
children,
source,
className = "overflow-y-auto"
className = "overflow-y-auto",
}) => {
const [buttonPosition, setButtonPosition] = useState({ x: 0, y: 0 });
const [showButton, setShowButton] = useState(false);
Expand Down Expand Up @@ -74,59 +74,58 @@ const BookmarkableText: React.FC<BookmarkableTextProps> = ({
});
};

// Listen for selection changes
useEffect(() => {
const handleClickOutside = (event: MouseEvent) => {
// if (
// isPopoverOpen &&
// popoverRef.current &&
// !popoverRef.current.contains(event.target as Node) &&
// buttonRef.current &&
// !buttonRef.current.contains(event.target as Node)
// ) {
// setIsPopoverOpen(false);
// }
const handleSelectionChange = () => {
if (isPopoverOpen) return;

const selection = window.getSelection();
if (!selection || selection.isCollapsed || !selection.toString().trim()) {
setShowButton(false);
}
};

document.addEventListener("mousedown", handleClickOutside);
document.addEventListener("selectionchange", handleSelectionChange);
document.addEventListener("mousedown", handleSelectionChange);

return () => {
document.removeEventListener("mousedown", handleClickOutside);
document.removeEventListener("selectionchange", handleSelectionChange);
document.removeEventListener("mousedown", handleSelectionChange);
};
}, [isPopoverOpen]);

const handleMultiElementSelection = (
event: React.MouseEvent | React.TouchEvent,
event: React.MouseEvent | React.TouchEvent
) => {
event.stopPropagation();
if (isPopoverOpen) return;

const selection = window.getSelection();
const text = selection?.toString().trim();

if (selection && !selection.isCollapsed) {
const range = selection.getRangeAt(0);
const fragment = range.cloneContents();
const tempDiv = document.createElement("div");
tempDiv.appendChild(fragment);
const text = tempDiv.innerText.trim();
if (text) {
form.setValue("editedText", text);
const rect = range.getBoundingClientRect();
setButtonPosition({
x: rect.left + rect.width / 2,
y: rect.top,
});
setShowButton(true);
} else {
// setShowButton(false);
}
} else {
// if (!isPopoverOpen) {
// setShowButton(false);
// } else {
// setShowButton(true);
// }
if (!selection || !text) {
setShowButton(false);
return;
}

const range = selection.getRangeAt(0);
const rect = range.getBoundingClientRect();

form.setValue("editedText", text);
setButtonPosition({
x: rect.left + rect.width / 2,
y: rect.top,
});
setShowButton(true);
};

const handlePopoverOpenChange = (open: boolean) => {
setIsPopoverOpen(open);
if (!open) {
const selection = window.getSelection();
if (!selection || selection.isCollapsed) {
setShowButton(false);
}
}
};

const handleClosePopover = () => {
Expand Down
6 changes: 3 additions & 3 deletions website/src/components/OperationCard.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,10 @@ const OperationHeader: React.FC<{
<p className="text-sm">
{optimizeResult === undefined ||
optimizeResult === null
? "Computing whether optimization is needed..."
? "Determining whether to recommend a decomposition..."
: optimizeResult === ""
? "No optimization recommended"
: "Optimization recommended because: " +
? "No decomposition recommended"
: "Decomposition recommended because: " +
optimizeResult}
</p>
</TooltipContent>
Expand Down
Loading

0 comments on commit e922488

Please sign in to comment.