diff --git a/docetl/builder.py b/docetl/builder.py index 81853e4e..0072553e 100644 --- a/docetl/builder.py +++ b/docetl/builder.py @@ -475,7 +475,7 @@ def _load_optimized_ops(self): else: self.console.log("[yellow]No optimized operations found[/yellow]") - def should_optimize(self, step_name: str, op_name: str) -> bool: + def should_optimize(self, step_name: str, op_name: str) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]], float]: """ Determine if an operation should be optimized. We do this by running the operations on a sample of the input data and checking if the output is correct. @@ -509,6 +509,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool: input_data = self._run_partial_step( step, ops_run, sample_size, op_name_to_object ) + output_data = input_data # If this is not the operation we want to optimize, just execute it and add to selectivities if f"{step.get('name')}/{op_name}" != f"{step_name}/{op_name}" and op_object.get("empty", False): @@ -530,7 +531,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool: timeout=self.timeout, is_filter=op_object.get("type") == "filter", ) - should_optimize_output = map_optimizer.should_optimize(op_object, input_data) + should_optimize_output, input_data, output_data = map_optimizer.should_optimize(op_object, input_data) elif op_object.get("type") == "reduce": reduce_optimizer = ReduceOptimizer( self.runner, @@ -540,7 +541,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool: self.max_threads, self._run_operation, ) - should_optimize_output = reduce_optimizer.should_optimize(op_object, input_data) + should_optimize_output, input_data, output_data = reduce_optimizer.should_optimize(op_object, input_data) elif op_object.get("type") == "resolve": resolve_optimizer = JoinOptimizer( self.runner, @@ -560,7 +561,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool: continue # Return the string and operation cost - return should_optimize_output, self.operations_cost + self.llm_client.total_cost + return should_optimize_output, input_data, output_data, self.operations_cost + self.llm_client.total_cost # Should not get here raise ValueError("No operation to optimize found") diff --git a/docetl/dataset.py b/docetl/dataset.py index b72398c0..0e1e9e59 100644 --- a/docetl/dataset.py +++ b/docetl/dataset.py @@ -344,11 +344,7 @@ def sample(self, n: int, random: bool = True) -> List[Dict]: ) sampled_data = rd.sample(data, n) else: - sampled_data = [] - for i, line in enumerate(f): - if i >= n: - break - sampled_data.append(json.loads(line)) + return json.load(f)[:n] elif ext == ".csv": import csv diff --git a/docetl/optimizers/map_optimizer/optimizer.py b/docetl/optimizers/map_optimizer/optimizer.py index 2d6f8346..20758ccf 100644 --- a/docetl/optimizers/map_optimizer/optimizer.py +++ b/docetl/optimizers/map_optimizer/optimizer.py @@ -86,7 +86,7 @@ def __init__( runner, llm_client, console, config, max_threads, is_filter ) - def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> str: + def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]: """ Determine if the given operation configuration should be optimized. """ @@ -95,9 +95,9 @@ def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, assessment_str = "\n".join(assessment.get("reasons", [])) + "\n\nHere are some improvements that may help:\n" + "\n".join(assessment.get("improvements", [])) if data_exceeds_limit: assessment_str += "\nAlso, the input data exceeds the token limit." - return assessment_str + return assessment_str, input_data, output_data else: - return "" + return "", input_data, output_data def _should_optimize_helper(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], int, float, str, Dict[str, Any], bool]: diff --git a/docetl/optimizers/reduce_optimizer.py b/docetl/optimizers/reduce_optimizer.py index 15b86824..8685d40d 100644 --- a/docetl/optimizers/reduce_optimizer.py +++ b/docetl/optimizers/reduce_optimizer.py @@ -124,10 +124,10 @@ def should_optimize_helper( return validation_results, prompt_tokens, model_input_context_length, model, validator_prompt, original_output - def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> str: + def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]: validation_results, prompt_tokens, model_input_context_length, model, validator_prompt, original_output = self.should_optimize_helper(op_config, input_data) if prompt_tokens * 1.5 > model_input_context_length: - return "The reduce prompt is likely to exceed the token limit for model {model}." + return "The reduce prompt is likely to exceed the token limit for model {model}.", input_data, original_output if validation_results.get("needs_improvement", False): return "\n".join( @@ -135,9 +135,9 @@ def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, f"Issues: {result['issues']} Suggestions: {result['suggestions']}" for result in validation_results["validation_results"] ] - ) + ), input_data, original_output else: - return "" + return "", input_data, original_output def optimize( self, diff --git a/docetl/runner.py b/docetl/runner.py index cc3d7bce..13973e24 100644 --- a/docetl/runner.py +++ b/docetl/runner.py @@ -349,7 +349,7 @@ def execute_step( # If sample is set, sample the input data if op_object.get("sample"): - input_data = self.datasets[step["input"]].sample(op_object["sample"]) + input_data = self.datasets[step["input"]].sample(op_object["sample"], False) with self.console.status("[bold]Running Operation:[/bold]") as status: status.update(f"Type: [cyan]{op_object['type']}[/cyan]") @@ -478,7 +478,7 @@ def _save_checkpoint(self, step_name: str, operation_name: str, data: List[Dict] f"[green]✓ [italic]Intermediate saved for operation '{operation_name}' in step '{step_name}' at {checkpoint_path}[/italic][/green]" ) - def should_optimize(self, step_name: str, op_name: str, **kwargs) -> Tuple[str, float]: + def should_optimize(self, step_name: str, op_name: str, **kwargs) -> Tuple[str, float, List[Dict[str, Any]], List[Dict[str, Any]]]: builder = Optimizer(self, **kwargs) return builder.should_optimize(step_name, op_name) diff --git a/server/app/routes/pipeline.py b/server/app/routes/pipeline.py index 31ab7f5f..1db96c00 100644 --- a/server/app/routes/pipeline.py +++ b/server/app/routes/pipeline.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import uuid from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect from server.app.models import PipelineRequest @@ -30,6 +30,8 @@ class OptimizeResult(BaseModel): task_id: str status: TaskStatus should_optimize: Optional[str] = None + input_data: Optional[List[Dict[str, Any]]] = None + output_data: Optional[List[Dict[str, Any]]] = None cost: Optional[float] = None error: Optional[str] = None created_at: datetime @@ -76,7 +78,7 @@ async def run_optimization(task_id: str, yaml_config: str, step_name: str, op_na # Run the actual optimization in a separate thread to not block runner = DSLRunner.from_yaml(yaml_config) - should_optimize, cost = await asyncio.to_thread( + should_optimize, input_data, output_data, cost = await asyncio.to_thread( runner.should_optimize, step_name, op_name @@ -85,6 +87,8 @@ async def run_optimization(task_id: str, yaml_config: str, step_name: str, op_na # Update task result tasks[task_id].status = TaskStatus.COMPLETED tasks[task_id].should_optimize = should_optimize + tasks[task_id].input_data = input_data + tasks[task_id].output_data = output_data tasks[task_id].cost = cost tasks[task_id].completed_at = datetime.now() diff --git a/website/src/app/types.ts b/website/src/app/types.ts index eb56b6df..c13e0fb6 100644 --- a/website/src/app/types.ts +++ b/website/src/app/types.ts @@ -93,6 +93,8 @@ export interface OptimizeResult { task_id: string; status: TaskStatus; should_optimize?: string; + input_data?: Array>; + output_data?: Array>; cost?: number; error?: string; created_at: string; diff --git a/website/src/components/BookmarkableText.tsx b/website/src/components/BookmarkableText.tsx index 749c1b85..e87c8e93 100644 --- a/website/src/components/BookmarkableText.tsx +++ b/website/src/components/BookmarkableText.tsx @@ -43,7 +43,7 @@ const formSchema = z.object({ const BookmarkableText: React.FC = ({ children, source, - className = "overflow-y-auto" + className = "overflow-y-auto", }) => { const [buttonPosition, setButtonPosition] = useState({ x: 0, y: 0 }); const [showButton, setShowButton] = useState(false); @@ -74,59 +74,58 @@ const BookmarkableText: React.FC = ({ }); }; + // Listen for selection changes useEffect(() => { - const handleClickOutside = (event: MouseEvent) => { - // if ( - // isPopoverOpen && - // popoverRef.current && - // !popoverRef.current.contains(event.target as Node) && - // buttonRef.current && - // !buttonRef.current.contains(event.target as Node) - // ) { - // setIsPopoverOpen(false); - // } + const handleSelectionChange = () => { + if (isPopoverOpen) return; + + const selection = window.getSelection(); + if (!selection || selection.isCollapsed || !selection.toString().trim()) { + setShowButton(false); + } }; - document.addEventListener("mousedown", handleClickOutside); + document.addEventListener("selectionchange", handleSelectionChange); + document.addEventListener("mousedown", handleSelectionChange); + return () => { - document.removeEventListener("mousedown", handleClickOutside); + document.removeEventListener("selectionchange", handleSelectionChange); + document.removeEventListener("mousedown", handleSelectionChange); }; }, [isPopoverOpen]); const handleMultiElementSelection = ( - event: React.MouseEvent | React.TouchEvent, + event: React.MouseEvent | React.TouchEvent ) => { - event.stopPropagation(); + if (isPopoverOpen) return; + const selection = window.getSelection(); + const text = selection?.toString().trim(); - if (selection && !selection.isCollapsed) { - const range = selection.getRangeAt(0); - const fragment = range.cloneContents(); - const tempDiv = document.createElement("div"); - tempDiv.appendChild(fragment); - const text = tempDiv.innerText.trim(); - if (text) { - form.setValue("editedText", text); - const rect = range.getBoundingClientRect(); - setButtonPosition({ - x: rect.left + rect.width / 2, - y: rect.top, - }); - setShowButton(true); - } else { - // setShowButton(false); - } - } else { - // if (!isPopoverOpen) { - // setShowButton(false); - // } else { - // setShowButton(true); - // } + if (!selection || !text) { + setShowButton(false); + return; } + + const range = selection.getRangeAt(0); + const rect = range.getBoundingClientRect(); + + form.setValue("editedText", text); + setButtonPosition({ + x: rect.left + rect.width / 2, + y: rect.top, + }); + setShowButton(true); }; const handlePopoverOpenChange = (open: boolean) => { setIsPopoverOpen(open); + if (!open) { + const selection = window.getSelection(); + if (!selection || selection.isCollapsed) { + setShowButton(false); + } + } }; const handleClosePopover = () => { diff --git a/website/src/components/OperationCard.tsx b/website/src/components/OperationCard.tsx index 8c5fab0e..63a2e70c 100644 --- a/website/src/components/OperationCard.tsx +++ b/website/src/components/OperationCard.tsx @@ -154,10 +154,10 @@ const OperationHeader: React.FC<{

{optimizeResult === undefined || optimizeResult === null - ? "Computing whether optimization is needed..." + ? "Determining whether to recommend a decomposition..." : optimizeResult === "" - ? "No optimization recommended" - : "Optimization recommended because: " + + ? "No decomposition recommended" + : "Decomposition recommended because: " + optimizeResult}

diff --git a/website/src/components/OptimizationDialog.tsx b/website/src/components/OptimizationDialog.tsx new file mode 100644 index 00000000..3766d677 --- /dev/null +++ b/website/src/components/OptimizationDialog.tsx @@ -0,0 +1,194 @@ +import React from "react"; +import { + Dialog, + DialogContent, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, +} from "@/components/ui/table"; +import { Button } from "@/components/ui/button"; +import { ChevronLeft, ChevronRight } from "lucide-react"; + +interface OptimizationDialogProps { + isOpen: boolean; + content: string; + prompt?: string; + operationName?: string; + inputData?: Array>; + outputData?: Array>; + onOpenChange: (open: boolean) => void; +} + +export const OptimizationDialog: React.FC = ({ + isOpen, + content, + prompt, + inputData, + outputData, + operationName, + onOpenChange, +}) => { + const [currentPage, setCurrentPage] = React.useState(1); + const rowsPerPage = 1; + + const shouldShowInputData = React.useMemo(() => { + if (!inputData?.length || !outputData?.length) return true; + + const inputKeys = new Set(Object.keys(inputData[0])); + const outputKeys = new Set(Object.keys(outputData[0])); + + return Array.from(inputKeys).some((key) => !outputKeys.has(key)); + }, [inputData, outputData]); + + const renderTable = (data: Array>) => { + if (!data.length) return null; + const columns = Object.keys(data[0]); + + const totalPages = Math.ceil(data.length / rowsPerPage); + const startIndex = (currentPage - 1) * rowsPerPage; + const paginatedData = data.slice(startIndex, startIndex + rowsPerPage); + + return ( +
+
+
+ + + + {columns.map((column) => ( + + {column} + + ))} + + + + {paginatedData.map((row, rowIndex) => ( + + {columns.map((column) => ( + +
+                          {typeof row[column] === "object"
+                            ? JSON.stringify(row[column], null, 2)
+                            : String(row[column])}
+                        
+
+ ))} +
+ ))} +
+
+
+
+ +
+
+ Row {startIndex + 1} of {data.length} +
+
+ + +
+
+
+ ); + }; + + React.useEffect(() => { + if (isOpen) { + setCurrentPage(1); + } + }, [isOpen]); + + return ( + + + + Decomposition Suggestions +

+ We've detected that the operation you're trying to run + might be too complex for the LLM. Consider breaking it down into + smaller operations. You can use our decomposition tool (lightning + button) to do this. +

+
+ +
+ {(operationName || prompt) && ( +
+ {operationName && ( +
+ + Operation: + + + {operationName} + +
+ )} + {prompt && ( +
+

+ Prompt: +

+
+ {prompt} +
+
+ )} +
+ )} + +
+ {shouldShowInputData && inputData && ( +
+

+ Sample Input Data +

+
{renderTable(inputData)}
+
+ )} + {outputData && ( +
+

+ Sample Output Data +

+
{renderTable(outputData)}
+
+ )} +
+ +
{content}
+
+
+
+ ); +}; + +OptimizationDialog.displayName = "OptimizationDialog"; diff --git a/website/src/components/PipelineGui.tsx b/website/src/components/PipelineGui.tsx index f9c89e45..54e9d992 100644 --- a/website/src/components/PipelineGui.tsx +++ b/website/src/components/PipelineGui.tsx @@ -58,6 +58,7 @@ import { useOptimizeCheck } from "@/hooks/useOptimizeCheck"; import { canBeOptimized } from "@/lib/utils"; import { Switch } from "./ui/switch"; import { Textarea } from "./ui/textarea"; +import { OptimizationDialog } from "@/components/OptimizationDialog"; const PipelineGUI: React.FC = () => { const fileInputRef = useRef(null); @@ -105,6 +106,19 @@ const PipelineGUI: React.FC = () => { const { toast } = useToast(); const { connect, sendMessage, lastMessage, readyState, disconnect } = useWebSocket(); + const [optimizationDialog, setOptimizationDialog] = useState<{ + isOpen: boolean; + content: string; + prompt?: string; + inputData?: Array>; + outputData?: Array>; + operationName?: string; + }>({ + isOpen: false, + content: "", + prompt: undefined, + operationName: undefined, + }); const { submitTask } = useOptimizeCheck({ onComplete: (result) => { @@ -117,14 +131,30 @@ const PipelineGUI: React.FC = () => { return newOps; }); setCost((prev) => prev + result.cost); - // Send toast with should optimize result - // If should_optimize string is not empty, send toast with should optimize result + if (result.should_optimize) { toast({ - title: `Hey! Consider optimizing ${ + title: `Hey! Consider decomposing ${ operations[operations.length - 1].name }`, - description: result.should_optimize, + description: ( + { + const lastOp = operations[operations.length - 1]; + setOptimizationDialog({ + isOpen: true, + content: result.should_optimize, + prompt: lastOp.prompt || "No prompt specified", + operationName: lastOp.name, + inputData: result.input_data, + outputData: result.output_data, + }); + }} + > + Click here to see why. + + ), duration: Infinity, }); } @@ -945,6 +975,17 @@ const PipelineGUI: React.FC = () => { + + setOptimizationDialog((prev) => ({ ...prev, isOpen: open })) + } + /> ); }; diff --git a/website/src/components/ResizableDataTable.tsx b/website/src/components/ResizableDataTable.tsx index fccc4f0f..ded94e9c 100644 --- a/website/src/components/ResizableDataTable.tsx +++ b/website/src/components/ResizableDataTable.tsx @@ -1,4 +1,10 @@ -import React, { useState, useEffect, useCallback, useMemo } from "react"; +import React, { + useState, + useEffect, + useCallback, + useMemo, + useRef, +} from "react"; import { flexRender, getCoreRowModel, @@ -40,6 +46,8 @@ import { TABLE_SETTINGS_KEY } from "@/app/localStorageKeys"; import ReactMarkdown from "react-markdown"; import debounce from "lodash/debounce"; import { BarChart, Bar, XAxis, Tooltip, ResponsiveContainer } from "recharts"; +import { Input } from "@/components/ui/input"; +import { Search } from "lucide-react"; export type DataType = Record; export type ColumnType = ColumnDef & { @@ -448,6 +456,163 @@ const MarkdownCell = React.memo(({ content }: MarkdownCellProps) => { }); MarkdownCell.displayName = "MarkdownCell"; +interface SearchableCellProps { + content: string; + isResizing: boolean; +} + +const SearchableCell = React.memo( + ({ content, isResizing }: SearchableCellProps) => { + const [searchTerm, setSearchTerm] = useState(""); + const [highlightedContent, setHighlightedContent] = useState(content); + const [currentMatchIndex, setCurrentMatchIndex] = useState(0); + const [matchCount, setMatchCount] = useState(0); + const containerRef = useRef(null); + + // Search functionality + useEffect(() => { + if (!searchTerm) { + setHighlightedContent(content); + setMatchCount(0); + setCurrentMatchIndex(0); + return; + } + + try { + const regex = new RegExp(`(${searchTerm})`, "gi"); + const matches = content.match(regex); + const matchesCount = matches ? matches.length : 0; + setMatchCount(matchesCount); + + if (matchesCount > 0) { + const highlighted = content.replace( + regex, + (match) => `${match}` + ); + setHighlightedContent(highlighted); + + // Scroll to current match + setTimeout(() => { + if (containerRef.current) { + const marks = + containerRef.current.getElementsByClassName("search-match"); + if (marks.length > 0 && currentMatchIndex < marks.length) { + marks[currentMatchIndex].scrollIntoView({ + behavior: "smooth", + block: "center", + }); + } + } + }, 100); + } else { + setHighlightedContent(content); + } + } catch { + setHighlightedContent(content); + setMatchCount(0); + } + }, [searchTerm, content, currentMatchIndex]); + + const navigateMatches = useCallback( + (direction: "next" | "prev") => { + if (matchCount === 0) return; + + if (direction === "next") { + setCurrentMatchIndex((prev) => (prev + 1) % matchCount); + } else { + setCurrentMatchIndex((prev) => (prev - 1 + matchCount) % matchCount); + } + }, + [matchCount] + ); + + // Style for search matches + useEffect(() => { + const styleId = "search-match-style"; + let styleElement = document.getElementById(styleId) as HTMLStyleElement; + + if (!styleElement) { + styleElement = document.createElement("style"); + styleElement.id = styleId; + document.head.appendChild(styleElement); + } + + styleElement.textContent = ` + mark.search-match { + background-color: hsl(var(--primary) / 0.2); + color: inherit; + padding: 0; + border-radius: 2px; + } + mark.search-match:nth-of-type(${currentMatchIndex + 1}) { + background-color: hsl(var(--primary) / 0.5); + } + `; + + return () => { + if (styleElement && styleElement.parentNode) { + styleElement.parentNode.removeChild(styleElement); + } + }; + }, [currentMatchIndex]); + + return ( +
+
+
+ + { + setSearchTerm(e.target.value); + setCurrentMatchIndex(0); + }} + className="h-6 text-xs border-none shadow-none focus-visible:ring-0" + /> + {matchCount > 0 && ( +
+ + + {currentMatchIndex + 1}/{matchCount} + + +
+ )} +
+
+
+ {isResizing ? ( +
{content}
+ ) : searchTerm ? ( +
+ ) : ( + + )} +
+
+ ); + } +); +SearchableCell.displayName = "SearchableCell"; + function ResizableDataTable({ data, columns, @@ -536,7 +701,7 @@ function ResizableDataTable({ columns: sortedColumns.map((col) => ({ ...col, enableSorting: true, - sortingFn: (rowA: any, rowB: any) => { + sortingFn: (rowA: Row, rowB: Row) => { const accessor = col.accessorKey; if (!accessor) return 0; @@ -721,11 +886,10 @@ function ResizableDataTable({ }} > {typeof cell.getValue() === "string" ? ( - isResizing ? ( -
{cell.getValue() as string}
- ) : ( - - ) + ) : ( flexRender( cell.column.columnDef.cell,