From acb655c6fa10e07df2f33a859b35ee1caa60551b Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Sun, 8 Dec 2024 16:26:12 -0800 Subject: [PATCH] fix: adding sampling back (#236) * chore: update lockfile * feat: clean up UI to look a bit more consistent and smooth * feat: clean up upload dialog * feat: clean up upload dialog * feat: clean up upload dialog * feat: clean up upload dialog * feat: add themes * feat: fix tests * chore: update type ignore errors * fix: add sampling back (somehow i accidentally deleted it in my refactor) * fix: percentile calculation bug --- docetl/runner.py | 8 + website/src/app/playground/page.tsx | 4 +- website/src/components/ColumnDialog.tsx | 160 +++++++++++++++++- website/src/components/PipelineGui.tsx | 101 +++++++++-- website/src/components/ResizableDataTable.tsx | 9 +- website/src/contexts/ThemeContext.tsx | 28 +-- 6 files changed, 273 insertions(+), 37 deletions(-) diff --git a/docetl/runner.py b/docetl/runner.py index e02439d8..74a7c461 100644 --- a/docetl/runner.py +++ b/docetl/runner.py @@ -344,6 +344,14 @@ def execute_step( ) continue + # Delete existing intermediate file before running operation + if self.intermediate_dir: + checkpoint_path = os.path.join( + self.intermediate_dir, step["name"], f"{operation_name}.json" + ) + if os.path.exists(checkpoint_path): + os.remove(checkpoint_path) + op_object = self.find_operation(operation_name).copy() op_object.update(operation_config) diff --git a/website/src/app/playground/page.tsx b/website/src/app/playground/page.tsx index cd8f12d6..afdfcb0a 100644 --- a/website/src/app/playground/page.tsx +++ b/website/src/app/playground/page.tsx @@ -353,8 +353,8 @@ const CodeEditorPipelineApp: React.FC = () => { Forest - - Magestic + + Majestic Sunset diff --git a/website/src/components/ColumnDialog.tsx b/website/src/components/ColumnDialog.tsx index 64c32c26..5ef6be77 100644 --- a/website/src/components/ColumnDialog.tsx +++ b/website/src/components/ColumnDialog.tsx @@ -1,4 +1,4 @@ -import 
React, { useState, useEffect, useCallback } from "react"; +import React, { useState, useEffect, useCallback, useMemo } from "react"; import { Dialog, DialogContent } from "@/components/ui/dialog"; import { Button } from "@/components/ui/button"; import { @@ -38,6 +38,11 @@ import { TooltipTrigger, TooltipProvider, } from "@/components/ui/tooltip"; +import { ColumnStats } from "@/components/ResizableDataTable"; +import { + WordCountHistogram, + CategoricalBarChart, +} from "@/components/ResizableDataTable"; interface ObservabilityIndicatorProps { row: Record; @@ -97,8 +102,154 @@ export interface ColumnDialogProps> { onNavigate: (direction: "prev" | "next") => void; onJumpToRow: (index: number) => void; currentOperation: string; + columnStats: ColumnStats | null; } +function calculatePercentile(value: number, values: number[]): number { + if (values.length === 0) return 0; + + const sortedValues = [...values].sort((a, b) => a - b); + const index = sortedValues.findIndex((v) => v >= value); + + // If value is larger than all values in the array + if (index === -1) return 100; + + // If value is less than or equal to the smallest value in the array + if (index === 0) return 0; + + // Calculate percentile ensuring it's between 0 and 100 + return Math.max( + 0, + Math.min(100, Math.round((index / sortedValues.length) * 100)) + ); +} + +interface ValueStatsProps { + value: unknown; + columnStats: ColumnStats | null; + data: Record[]; + columnId: string; +} + +const ValueStats = React.memo( + ({ value, columnStats, data, columnId }: ValueStatsProps) => { + if (!columnStats) return null; + + const currentValue = + typeof value === "number" + ? value + : typeof value === "string" + ? columnStats.type === "string-chars" + ? value.length + : value.split(/\s+/).length + : Array.isArray(value) + ? value.length + : typeof value === "boolean" + ? value + ? 
1 + : 0 + : null; + + // Get all actual values from the data + const allValues = data + .map((row) => { + const val = row[columnId]; + if (val == null) return null; + if (typeof val === "number") return val; + if (typeof val === "string") + return columnStats.type === "string-chars" + ? val.length + : val.split(/\s+/).length; + if (Array.isArray(val)) return val.length; + if (typeof val === "boolean") return val ? 1 : 0; + return null; + }) + .filter((v): v is number => v !== null); + + const percentile = + currentValue !== null + ? calculatePercentile(currentValue, allValues) + : null; + + return ( +
+
+ {percentile !== null && ( +
+
+ {percentile} + th +
+
percentile
+
+ )} + +
+ {columnStats.isLowCardinality ? ( + + ) : ( + ({ + range: String( + Math.round(columnStats.min + i * columnStats.bucketSize) + ), + count, + fullRange: `${Math.round( + columnStats.min + i * columnStats.bucketSize + )} - ${Math.round( + columnStats.min + (i + 1) * columnStats.bucketSize + )}${ + columnStats.type === "array" + ? " items" + : columnStats.type === "string-chars" + ? " chars" + : columnStats.type === "string-words" + ? " words" + : "" + }`, + }))} + /> + )} +
+
+ +
+
+
Type
+
{columnStats.type}
+
+
+
Distinct Values
+
+ {columnStats.distinctCount} / {columnStats.totalCount} +
+
+
+
Current
+
+ {currentValue} + {columnStats.type === "array" + ? " items" + : columnStats.type === "string-chars" + ? " chars" + : columnStats.type === "string-words" + ? " words" + : ""} +
+
+
+
Range
+
+ {columnStats.min} - {columnStats.max} +
+
+
+
+ ); + } +); +ValueStats.displayName = "ValueStats"; + export function ColumnDialog>({ isOpen, onClose, @@ -109,6 +260,7 @@ export function ColumnDialog>({ onNavigate, onJumpToRow, currentOperation, + columnStats, }: ColumnDialogProps) { const [splitView, setSplitView] = useState(false); const [compareIndex, setCompareIndex] = useState(null); @@ -252,6 +404,12 @@ export function ColumnDialog>({
+
{renderContent(value)}
diff --git a/website/src/components/PipelineGui.tsx b/website/src/components/PipelineGui.tsx index e2d3ee52..db7ad39c 100644 --- a/website/src/components/PipelineGui.tsx +++ b/website/src/components/PipelineGui.tsx @@ -165,9 +165,6 @@ const PipelineGUI: React.FC = () => { const [tempAutoOptimizeCheck, setTempAutoOptimizeCheck] = useState(autoOptimizeCheck); const [tempOptimizerModel, setTempOptimizerModel] = useState(optimizerModel); - const [tempSampleSize, setTempSampleSize] = useState( - sampleSize?.toString() || "" - ); const [tempCurrentFile, setTempCurrentFile] = useState( currentFile ); @@ -377,12 +374,6 @@ const PipelineGUI: React.FC = () => { } }, [optimizerModel]); - useEffect(() => { - if (sampleSize) { - setTempSampleSize(sampleSize.toString()); - } - }, [sampleSize]); - const handleFileUpload = async ( event: React.ChangeEvent ) => { @@ -659,13 +650,6 @@ const PipelineGUI: React.FC = () => { const handleSettingsSave = () => { setPipelineName(tempPipelineName); - setSampleSize( - tempSampleSize === "" - ? null - : tempSampleSize === null - ? null - : parseInt(tempSampleSize, 10) - ); setCurrentFile(tempCurrentFile); setDefaultModel(tempDefaultModel); setIsSettingsOpen(false); @@ -906,6 +890,91 @@ const PipelineGUI: React.FC = () => { + + + + +
+ +
+
+ +

Set sample size for operations

+
+
+
+ + +
+
+ + + + + + +

Load from YAML

+
+
+
+ + + + + + + +

Save to YAML

+
+
+
+ +
diff --git a/website/src/components/ResizableDataTable.tsx b/website/src/components/ResizableDataTable.tsx index 2393abef..ff053d84 100644 --- a/website/src/components/ResizableDataTable.tsx +++ b/website/src/components/ResizableDataTable.tsx @@ -68,7 +68,7 @@ export type ColumnType = { id?: string; }; -interface ColumnStats { +export interface ColumnStats { min: number; max: number; avg: number; @@ -102,7 +102,7 @@ function calculateDistinctValueCounts( return valueCounts; } -function calculateColumnStats( +export function calculateColumnStats( data: Record[], accessor: string ): ColumnStats | null { @@ -256,7 +256,7 @@ const truncateString = (str: string, maxLength: number = 20) => { return str.slice(0, maxLength) + "..."; }; -const WordCountHistogram = memo( +export const WordCountHistogram = memo( ({ histogramData, }: { @@ -328,7 +328,7 @@ const WordCountHistogram = memo( ); WordCountHistogram.displayName = "WordCountHistogram"; -const CategoricalBarChart = memo( +export const CategoricalBarChart = memo( ({ data }: { data: { value: string; count: number }[] }) => { // Memoize total count calculation const totalCount = useMemo( @@ -1339,6 +1339,7 @@ export default function ResizableDataTable>({ }} onJumpToRow={(index) => setCurrentValueIndex(index)} currentOperation={currentOperation} + columnStats={columnStats[activeColumn]} /> )}
diff --git a/website/src/contexts/ThemeContext.tsx b/website/src/contexts/ThemeContext.tsx index 125a0440..fb95baf4 100644 --- a/website/src/contexts/ThemeContext.tsx +++ b/website/src/contexts/ThemeContext.tsx @@ -5,7 +5,7 @@ import React, { createContext, useContext, useEffect, useState } from "react"; export type Theme = | "default" | "forest" - | "magestic" + | "majestic" | "sunset" | "ruby" | "monochrome"; @@ -65,12 +65,12 @@ const themes = { input: "150 30% 18%", ring: "150 100% 50%", chart1: "150 70% 40%", - chart2: "120 65% 45%", - chart3: "180 60% 35%", - chart4: "90 55% 45%", - chart5: "165 70% 40%", + chart2: "35 85% 50%", + chart3: "195 65% 40%", + chart4: "105 60% 45%", + chart5: "270 45% 45%", }, - magestic: { + majestic: { background: "270 30% 99%", foreground: "270 10% 5%", card: "270 20% 97%", @@ -91,10 +91,10 @@ const themes = { input: "270 30% 18%", ring: "270 100% 50%", chart1: "270 70% 60%", - chart2: "290 65% 55%", - chart3: "250 60% 50%", - chart4: "310 70% 45%", - chart5: "230 65% 55%", + chart2: "330 65% 55%", + chart3: "210 60% 50%", + chart4: "30 70% 55%", + chart5: "150 55% 45%", }, sunset: { background: "30 30% 99%", @@ -143,10 +143,10 @@ const themes = { input: "345 30% 18%", ring: "345 100% 50%", chart1: "345 85% 55%", - chart2: "375 80% 50%", - chart3: "315 75% 45%", - chart4: "0 70% 50%", - chart5: "330 80% 55%", + chart2: "195 70% 50%", + chart3: "45 75% 55%", + chart4: "315 65% 45%", + chart5: "165 60% 50%", }, monochrome: { background: "0 0% 98%",