Commit

chore: cleaner ui for optimization and visualizing outputs
shreyashankar committed Nov 15, 2024
1 parent 02e5274 commit 71b8bfe
Showing 12 changed files with 473 additions and 72 deletions.
9 changes: 5 additions & 4 deletions docetl/builder.py
@@ -475,7 +475,7 @@ def _load_optimized_ops(self):
else:
self.console.log("[yellow]No optimized operations found[/yellow]")

- def should_optimize(self, step_name: str, op_name: str) -> bool:
+ def should_optimize(self, step_name: str, op_name: str) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]], float]:
"""
Determine if an operation should be optimized.
We do this by running the operations on a sample of the input data and checking if the output is correct.
@@ -509,6 +509,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool:
input_data = self._run_partial_step(
step, ops_run, sample_size, op_name_to_object
)
+ output_data = input_data

# If this is not the operation we want to optimize, just execute it and add to selectivities
if f"{step.get('name')}/{op_name}" != f"{step_name}/{op_name}" and op_object.get("empty", False):
@@ -530,7 +531,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool:
timeout=self.timeout,
is_filter=op_object.get("type") == "filter",
)
- should_optimize_output = map_optimizer.should_optimize(op_object, input_data)
+ should_optimize_output, input_data, output_data = map_optimizer.should_optimize(op_object, input_data)
elif op_object.get("type") == "reduce":
reduce_optimizer = ReduceOptimizer(
self.runner,
@@ -540,7 +541,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool:
self.max_threads,
self._run_operation,
)
- should_optimize_output = reduce_optimizer.should_optimize(op_object, input_data)
+ should_optimize_output, input_data, output_data = reduce_optimizer.should_optimize(op_object, input_data)
elif op_object.get("type") == "resolve":
resolve_optimizer = JoinOptimizer(
self.runner,
@@ -560,7 +561,7 @@ def should_optimize(self, step_name: str, op_name: str) -> bool:
continue

# Return the string and operation cost
- return should_optimize_output, self.operations_cost + self.llm_client.total_cost
+ return should_optimize_output, input_data, output_data, self.operations_cost + self.llm_client.total_cost

# Should not get here
raise ValueError("No operation to optimize found")
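A minimal usage sketch of the new contract (hypothetical caller; the optimizer instance, step name, and operation name are illustrative): should_optimize now hands back the sampled input and output rows alongside the assessment string and the cost, so callers unpack four values instead of two.

# Hypothetical caller of Optimizer.should_optimize; names are illustrative.
assessment, sample_input, sample_output, cost = optimizer.should_optimize(
    step_name="extract",    # hypothetical step name
    op_name="summarize",    # hypothetical operation name
)
if assessment:
    print(f"Decomposition recommended (cost ${cost:.4f}):\n{assessment}")
    print(f"Inspected {len(sample_input)} input and {len(sample_output)} output rows")
else:
    print("No decomposition recommended")
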
6 changes: 1 addition & 5 deletions docetl/dataset.py
@@ -344,11 +344,7 @@ def sample(self, n: int, random: bool = True) -> List[Dict]:
)
sampled_data = rd.sample(data, n)
else:
- sampled_data = []
- for i, line in enumerate(f):
-     if i >= n:
-         break
-     sampled_data.append(json.loads(line))
+ return json.load(f)[:n]

elif ext == ".csv":
import csv
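A rough sketch of the new non-random path, assuming the dataset file holds a single JSON array (which json.load implies): the whole file is parsed in one call and the first n records are returned, replacing the previous line-by-line JSONL read.

import json
from typing import Dict, List

def sample_first_n(path: str, n: int) -> List[Dict]:
    # Parse the file as one JSON array and keep the first n records,
    # mirroring the non-random branch of Dataset.sample above.
    with open(path) as f:
        return json.load(f)[:n]
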
6 changes: 3 additions & 3 deletions docetl/optimizers/map_optimizer/optimizer.py
@@ -86,7 +86,7 @@ def __init__(
runner, llm_client, console, config, max_threads, is_filter
)

- def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> str:
+ def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
"""
Determine if the given operation configuration should be optimized.
"""
@@ -95,9 +95,9 @@ def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str,
assessment_str = "\n".join(assessment.get("reasons", [])) + "\n\nHere are some improvements that may help:\n" + "\n".join(assessment.get("improvements", []))
if data_exceeds_limit:
assessment_str += "\nAlso, the input data exceeds the token limit."
- return assessment_str
+ return assessment_str, input_data, output_data
else:
- return ""
+ return "", input_data, output_data


def _should_optimize_helper(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], int, float, str, Dict[str, Any], bool]:
8 changes: 4 additions & 4 deletions docetl/optimizers/reduce_optimizer.py
@@ -124,20 +124,20 @@ def should_optimize_helper(

return validation_results, prompt_tokens, model_input_context_length, model, validator_prompt, original_output

- def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> str:
+ def should_optimize(self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
validation_results, prompt_tokens, model_input_context_length, model, validator_prompt, original_output = self.should_optimize_helper(op_config, input_data)
if prompt_tokens * 1.5 > model_input_context_length:
- return "The reduce prompt is likely to exceed the token limit for model {model}."
+ return "The reduce prompt is likely to exceed the token limit for model {model}.", input_data, original_output

if validation_results.get("needs_improvement", False):
return "\n".join(
[
f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
for result in validation_results["validation_results"]
]
- )
+ ), input_data, original_output
else:
- return ""
+ return "", input_data, original_output

def optimize(
self,
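The map and reduce optimizers now share the same return shape; a schematic stub (illustrative only, not the real classes) of the contract a caller can rely on: an empty assessment string means no decomposition is recommended, and the sampled input and output rows come back in either case.

from typing import Any, Dict, List, Tuple

def should_optimize_stub(
    op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
    # In the real optimizers, output_data comes from running the operation
    # on the sampled input during the assessment.
    output_data: List[Dict[str, Any]] = []
    assessment = ""  # "" signals that no decomposition is recommended
    return assessment, input_data, output_data
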
4 changes: 2 additions & 2 deletions docetl/runner.py
@@ -349,7 +349,7 @@ def execute_step(

# If sample is set, sample the input data
if op_object.get("sample"):
- input_data = self.datasets[step["input"]].sample(op_object["sample"])
+ input_data = self.datasets[step["input"]].sample(op_object["sample"], False)

with self.console.status("[bold]Running Operation:[/bold]") as status:
status.update(f"Type: [cyan]{op_object['type']}[/cyan]")
@@ -478,7 +478,7 @@ def _save_checkpoint(self, step_name: str, operation_name: str, data: List[Dict]
f"[green]✓ [italic]Intermediate saved for operation '{operation_name}' in step '{step_name}' at {checkpoint_path}[/italic][/green]"
)

- def should_optimize(self, step_name: str, op_name: str, **kwargs) -> Tuple[str, float]:
+ def should_optimize(self, step_name: str, op_name: str, **kwargs) -> Tuple[str, float, List[Dict[str, Any]], List[Dict[str, Any]]]:
builder = Optimizer(self, **kwargs)
return builder.should_optimize(step_name, op_name)

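The runner now requests a non-random sample when an operation sets sample, so sampled runs are deterministic; a small illustration of the two modes, assuming a Dataset instance named dataset (hypothetical) and the sample(n, random=True) signature shown in dataset.py above.

first_ten = dataset.sample(10, False)  # deterministic: first 10 records
random_ten = dataset.sample(10)        # default: random sample of 10 records
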
8 changes: 6 additions & 2 deletions server/app/routes/pipeline.py
@@ -1,4 +1,4 @@
- from typing import Any, Dict, Optional
+ from typing import Any, Dict, List, Optional
import uuid
from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
from server.app.models import PipelineRequest
@@ -30,6 +30,8 @@ class OptimizeResult(BaseModel):
task_id: str
status: TaskStatus
should_optimize: Optional[str] = None
+ input_data: Optional[List[Dict[str, Any]]] = None
+ output_data: Optional[List[Dict[str, Any]]] = None
cost: Optional[float] = None
error: Optional[str] = None
created_at: datetime
Expand Down Expand Up @@ -76,7 +78,7 @@ async def run_optimization(task_id: str, yaml_config: str, step_name: str, op_na

# Run the actual optimization in a separate thread to not block
runner = DSLRunner.from_yaml(yaml_config)
- should_optimize, cost = await asyncio.to_thread(
+ should_optimize, input_data, output_data, cost = await asyncio.to_thread(
runner.should_optimize,
step_name,
op_name
@@ -85,6 +87,8 @@
# Update task result
tasks[task_id].status = TaskStatus.COMPLETED
tasks[task_id].should_optimize = should_optimize
+ tasks[task_id].input_data = input_data
+ tasks[task_id].output_data = output_data
tasks[task_id].cost = cost
tasks[task_id].completed_at = datetime.now()

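A hedged client-side sketch of consuming the enriched task result (the host and route path are assumptions, not taken from this diff): a completed task now carries input_data and output_data alongside the assessment and cost, so a UI can show exactly what the optimizer inspected.

import requests

task_id = "..."  # placeholder: id returned when the optimization task was created
resp = requests.get(f"http://localhost:8000/should_optimize/{task_id}")  # assumed route
result = resp.json()
if result["status"] == "completed":  # exact status string depends on TaskStatus serialization
    print(result["should_optimize"] or "No decomposition recommended")
    print(f"cost: ${result['cost']:.4f}")
    print(f"rows available to visualize: {len(result['input_data'] or [])}")
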
2 changes: 2 additions & 0 deletions website/src/app/types.ts
@@ -93,6 +93,8 @@ export interface OptimizeResult {
task_id: string;
status: TaskStatus;
should_optimize?: string;
+ input_data?: Array<Record<string, unknown>>;
+ output_data?: Array<Record<string, unknown>>;
cost?: number;
error?: string;
created_at: string;
75 changes: 37 additions & 38 deletions website/src/components/BookmarkableText.tsx
@@ -43,7 +43,7 @@ const formSchema = z.object({
const BookmarkableText: React.FC<BookmarkableTextProps> = ({
children,
source,
- className = "overflow-y-auto"
+ className = "overflow-y-auto",
}) => {
const [buttonPosition, setButtonPosition] = useState({ x: 0, y: 0 });
const [showButton, setShowButton] = useState(false);
@@ -74,59 +74,58 @@ const BookmarkableText: React.FC<BookmarkableTextProps> = ({
});
};

// Listen for selection changes
useEffect(() => {
- const handleClickOutside = (event: MouseEvent) => {
- // if (
- // isPopoverOpen &&
- // popoverRef.current &&
- // !popoverRef.current.contains(event.target as Node) &&
- // buttonRef.current &&
- // !buttonRef.current.contains(event.target as Node)
- // ) {
- // setIsPopoverOpen(false);
- // }
+ const handleSelectionChange = () => {
+ if (isPopoverOpen) return;

+ const selection = window.getSelection();
+ if (!selection || selection.isCollapsed || !selection.toString().trim()) {
+ setShowButton(false);
+ }
};

- document.addEventListener("mousedown", handleClickOutside);
+ document.addEventListener("selectionchange", handleSelectionChange);
+ document.addEventListener("mousedown", handleSelectionChange);

return () => {
- document.removeEventListener("mousedown", handleClickOutside);
+ document.removeEventListener("selectionchange", handleSelectionChange);
+ document.removeEventListener("mousedown", handleSelectionChange);
};
}, [isPopoverOpen]);

const handleMultiElementSelection = (
- event: React.MouseEvent | React.TouchEvent,
+ event: React.MouseEvent | React.TouchEvent
) => {
event.stopPropagation();
+ if (isPopoverOpen) return;

const selection = window.getSelection();
+ const text = selection?.toString().trim();

- if (selection && !selection.isCollapsed) {
- const range = selection.getRangeAt(0);
- const fragment = range.cloneContents();
- const tempDiv = document.createElement("div");
- tempDiv.appendChild(fragment);
- const text = tempDiv.innerText.trim();
- if (text) {
- form.setValue("editedText", text);
- const rect = range.getBoundingClientRect();
- setButtonPosition({
- x: rect.left + rect.width / 2,
- y: rect.top,
- });
- setShowButton(true);
- } else {
- // setShowButton(false);
- }
- } else {
- // if (!isPopoverOpen) {
- // setShowButton(false);
- // } else {
- // setShowButton(true);
- // }
+ if (!selection || !text) {
+ setShowButton(false);
+ return;
}

+ const range = selection.getRangeAt(0);
+ const rect = range.getBoundingClientRect();

+ form.setValue("editedText", text);
+ setButtonPosition({
+ x: rect.left + rect.width / 2,
+ y: rect.top,
+ });
+ setShowButton(true);
};

+ const handlePopoverOpenChange = (open: boolean) => {
+ setIsPopoverOpen(open);
+ if (!open) {
+ const selection = window.getSelection();
+ if (!selection || selection.isCollapsed) {
+ setShowButton(false);
+ }
+ }
+ };

const handleClosePopover = () => {
6 changes: 3 additions & 3 deletions website/src/components/OperationCard.tsx
@@ -154,10 +154,10 @@ const OperationHeader: React.FC<{
<p className="text-sm">
{optimizeResult === undefined ||
optimizeResult === null
- ? "Computing whether optimization is needed..."
+ ? "Determining whether to recommend a decomposition..."
: optimizeResult === ""
- ? "No optimization recommended"
- : "Optimization recommended because: " +
+ ? "No decomposition recommended"
+ : "Decomposition recommended because: " +
optimizeResult}
</p>
</TooltipContent>