Skip to content

Commit

Permalink
fix: adding sampling back (#236)
Browse files Browse the repository at this point in the history
* chore: update lockfile

* feat: clean up UI to look a bit more consistent and smooth

* feat: clean up upload dialog

* feat: clean up upload dialog

* feat: clean up upload dialog

* feat: clean up upload dialog

* feat: add themes

* feat: fix tests

* chore: update type ignore errors

* fix: add sampling back (somehow I accidentally deleted it in my refactor)

* fix: percentile calculation bug
  • Loading branch information
shreyashankar authored Dec 9, 2024
1 parent e5b89c3 commit acb655c
Show file tree
Hide file tree
Showing 6 changed files with 273 additions and 37 deletions.
8 changes: 8 additions & 0 deletions docetl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,14 @@ def execute_step(
)
continue

# Delete existing intermediate file before running operation
if self.intermediate_dir:
checkpoint_path = os.path.join(
self.intermediate_dir, step["name"], f"{operation_name}.json"
)
if os.path.exists(checkpoint_path):
os.remove(checkpoint_path)

op_object = self.find_operation(operation_name).copy()
op_object.update(operation_config)

Expand Down
4 changes: 2 additions & 2 deletions website/src/app/playground/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,8 @@ const CodeEditorPipelineApp: React.FC = () => {
<MenubarRadioItem value="forest">
Forest
</MenubarRadioItem>
<MenubarRadioItem value="magestic">
Magestic
<MenubarRadioItem value="majestic">
Majestic
</MenubarRadioItem>
<MenubarRadioItem value="sunset">
Sunset
Expand Down
160 changes: 159 additions & 1 deletion website/src/components/ColumnDialog.tsx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import React, { useState, useEffect, useCallback } from "react";
import React, { useState, useEffect, useCallback, useMemo } from "react";
import { Dialog, DialogContent } from "@/components/ui/dialog";
import { Button } from "@/components/ui/button";
import {
Expand Down Expand Up @@ -38,6 +38,11 @@ import {
TooltipTrigger,
TooltipProvider,
} from "@/components/ui/tooltip";
import { ColumnStats } from "@/components/ResizableDataTable";
import {
WordCountHistogram,
CategoricalBarChart,
} from "@/components/ResizableDataTable";

interface ObservabilityIndicatorProps {
row: Record<string, unknown>;
Expand Down Expand Up @@ -97,8 +102,154 @@ export interface ColumnDialogProps<T extends Record<string, unknown>> {
onNavigate: (direction: "prev" | "next") => void;
onJumpToRow: (index: number) => void;
currentOperation: string;
columnStats: ColumnStats | null;
}

/**
 * Returns the percentile rank (0-100, rounded to an integer) of `value`
 * within `values`, defined as the share of entries that sort strictly
 * before the first entry that is greater than or equal to `value`.
 *
 * @param value - The number whose rank is wanted.
 * @param values - The population to rank against; it is not mutated.
 * @returns 0 for an empty population or when `value` is no larger than the
 *   smallest entry, 100 when `value` exceeds every entry, otherwise the
 *   rounded percentage of entries below `value`.
 */
function calculatePercentile(value: number, values: number[]): number {
  const total = values.length;
  if (total === 0) {
    return 0;
  }

  // Work on a copy so the caller's array keeps its order.
  const ascending = Array.from(values);
  ascending.sort((left, right) => left - right);

  // Count the entries that come before the first one that is >= value.
  let below = 0;
  for (const candidate of ascending) {
    if (candidate >= value) {
      break;
    }
    below += 1;
  }

  // No entry reached `value`: it is larger than everything in the population.
  if (below === total) {
    return 100;
  }

  // The very first entry already reaches `value`: nothing is smaller than it.
  if (below === 0) {
    return 0;
  }

  // Clamp defensively so the result always stays inside 0..100.
  const rounded = Math.round((below / total) * 100);
  return Math.min(100, Math.max(0, rounded));
}

/** Props accepted by the `ValueStats` panel. */
interface ValueStatsProps {
/** The cell value being inspected; it is reduced to a number before ranking. */
value: unknown;
/** Summary statistics for the column; when `null` the panel renders nothing. */
columnStats: ColumnStats | null;
/** Rows whose `columnId` entries form the population used for the percentile. */
data: Record<string, unknown>[];
/** Key used to read this column's value out of each row in `data`. */
columnId: string;
}

/**
 * Summary panel for a single cell value.
 *
 * Shows the percentile of the value within its column (when the value can be
 * reduced to a number), a distribution chart (a categorical bar chart for
 * low-cardinality columns, a histogram otherwise), and four summary cells:
 * type, distinct values, current value and range. Renders nothing when no
 * column statistics are supplied.
 */
const ValueStats = React.memo(
  ({ value, columnStats, data, columnId }: ValueStatsProps) => {
    if (!columnStats) return null;

    const kind = columnStats.type;

    // Reduce any supported cell value to the number used for ranking:
    // numbers as-is, strings by character or word count, arrays by length,
    // booleans as 1/0. Anything else (including null/undefined) has no measure.
    const toMeasure = (raw: unknown): number | null => {
      if (raw == null) return null;
      switch (typeof raw) {
        case "number":
          return raw;
        case "string":
          return kind === "string-chars"
            ? raw.length
            : raw.split(/\s+/).length;
        case "boolean":
          return raw ? 1 : 0;
        default:
          return Array.isArray(raw) ? raw.length : null;
      }
    };

    // Unit label appended after counts, chosen from the column's type.
    let unitSuffix = "";
    if (kind === "array") {
      unitSuffix = " items";
    } else if (kind === "string-chars") {
      unitSuffix = " chars";
    } else if (kind === "string-words") {
      unitSuffix = " words";
    }

    const currentValue = toMeasure(value);

    // Gather the measurable values of this column across all rows.
    const allValues: number[] = [];
    for (const row of data) {
      const measured = toMeasure(row[columnId]);
      if (measured !== null) {
        allValues.push(measured);
      }
    }

    const percentile =
      currentValue === null
        ? null
        : calculatePercentile(currentValue, allValues);

    const percentileBadge =
      percentile === null ? null : (
        <div className="flex-none">
          <div className="text-3xl font-bold text-primary">
            {percentile}
            <span className="text-lg">th</span>
          </div>
          <div className="text-xs text-muted-foreground">percentile</div>
        </div>
      );

    // The histogram buckets are only built when the histogram is shown.
    const chart = columnStats.isLowCardinality ? (
      <CategoricalBarChart data={columnStats.sortedValueCounts} />
    ) : (
      <WordCountHistogram
        histogramData={columnStats.distribution.map((count, i) => {
          const lower = Math.round(
            columnStats.min + i * columnStats.bucketSize
          );
          const upper = Math.round(
            columnStats.min + (i + 1) * columnStats.bucketSize
          );
          return {
            range: String(lower),
            count,
            fullRange: `${lower} - ${upper}${unitSuffix}`,
          };
        })}
      />
    );

    return (
      <div className="p-4 border-b bg-muted/5">
        <div className="flex items-center gap-4 mb-2">
          {percentileBadge}

          <div className="flex-1 h-12">{chart}</div>
        </div>

        <div className="grid grid-cols-4 gap-2 text-xs">
          <div className="space-y-0.5">
            <div className="font-medium">Type</div>
            <div className="text-muted-foreground">{kind}</div>
          </div>
          <div className="space-y-0.5">
            <div className="font-medium">Distinct Values</div>
            <div className="text-muted-foreground">
              {columnStats.distinctCount} / {columnStats.totalCount}
            </div>
          </div>
          <div className="space-y-0.5">
            <div className="font-medium">Current</div>
            <div className="text-muted-foreground">
              {currentValue}
              {unitSuffix}
            </div>
          </div>
          <div className="space-y-0.5">
            <div className="font-medium">Range</div>
            <div className="text-muted-foreground">
              {columnStats.min} - {columnStats.max}
            </div>
          </div>
        </div>
      </div>
    );
  }
);
ValueStats.displayName = "ValueStats";

export function ColumnDialog<T extends Record<string, unknown>>({
isOpen,
onClose,
Expand All @@ -109,6 +260,7 @@ export function ColumnDialog<T extends Record<string, unknown>>({
onNavigate,
onJumpToRow,
currentOperation,
columnStats,
}: ColumnDialogProps<T>) {
const [splitView, setSplitView] = useState(false);
const [compareIndex, setCompareIndex] = useState<number | null>(null);
Expand Down Expand Up @@ -252,6 +404,12 @@ export function ColumnDialog<T extends Record<string, unknown>>({
</Tooltip>

<div className="flex-1 overflow-auto">
<ValueStats
value={value}
columnStats={columnStats}
data={data}
columnId={columnId}
/>
<div className="px-4 py-2">{renderContent(value)}</div>
</div>

Expand Down
101 changes: 85 additions & 16 deletions website/src/components/PipelineGui.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,6 @@ const PipelineGUI: React.FC = () => {
const [tempAutoOptimizeCheck, setTempAutoOptimizeCheck] =
useState(autoOptimizeCheck);
const [tempOptimizerModel, setTempOptimizerModel] = useState(optimizerModel);
const [tempSampleSize, setTempSampleSize] = useState(
sampleSize?.toString() || ""
);
const [tempCurrentFile, setTempCurrentFile] = useState<File | null>(
currentFile
);
Expand Down Expand Up @@ -377,12 +374,6 @@ const PipelineGUI: React.FC = () => {
}
}, [optimizerModel]);

useEffect(() => {
if (sampleSize) {
setTempSampleSize(sampleSize.toString());
}
}, [sampleSize]);

const handleFileUpload = async (
event: React.ChangeEvent<HTMLInputElement>
) => {
Expand Down Expand Up @@ -659,13 +650,6 @@ const PipelineGUI: React.FC = () => {

const handleSettingsSave = () => {
setPipelineName(tempPipelineName);
setSampleSize(
tempSampleSize === ""
? null
: tempSampleSize === null
? null
: parseInt(tempSampleSize, 10)
);
setCurrentFile(tempCurrentFile);
setDefaultModel(tempDefaultModel);
setIsSettingsOpen(false);
Expand Down Expand Up @@ -906,6 +890,91 @@ const PipelineGUI: React.FC = () => {
</div>
</PopoverContent>
</Popover>

<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<div className="flex items-center">
<Button
variant="outline"
size="sm"
className="h-8 px-2 flex items-center gap-2"
>
<PieChart size={14} />
<Input
type="number"
value={sampleSize || ""}
onChange={(e) => {
const value = e.target.value;
setSampleSize(
value === "" ? null : parseInt(value, 10)
);
}}
className="w-16 h-6 text-xs border-0 p-0 focus-visible:ring-0"
placeholder="All docs"
/>
</Button>
</div>
</TooltipTrigger>
<TooltipContent>
<p>Set sample size for operations</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
</div>

<div className="flex items-center border-l pl-2">
<div className="flex items-center space-x-1">
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<Button
variant="ghost"
size="icon"
onClick={() => fileInputRef.current?.click()}
className="h-8 w-8"
>
<FileUp size={16} />
</Button>
</TooltipTrigger>
<TooltipContent>
<p>Load from YAML</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<Input
type="file"
ref={fileInputRef}
onChange={handleFileUpload}
accept=".yaml,.yml"
className="hidden"
/>
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<Button
size="icon"
variant="ghost"
onClick={() => handleExport()}
className="h-8 w-8"
>
<Download size={16} />
</Button>
</TooltipTrigger>
<TooltipContent>
<p>Save to YAML</p>
</TooltipContent>
</Tooltip>
</TooltipProvider>
<Button
size="icon"
variant="ghost"
onClick={() => setIsSettingsOpen(true)}
className="h-8 w-8"
>
<Settings size={16} />
</Button>
</div>
</div>

<div className="flex items-center border-l pl-2">
Expand Down
9 changes: 5 additions & 4 deletions website/src/components/ResizableDataTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ export type ColumnType<T> = {
id?: string;
};

interface ColumnStats {
export interface ColumnStats {
min: number;
max: number;
avg: number;
Expand Down Expand Up @@ -102,7 +102,7 @@ function calculateDistinctValueCounts(
return valueCounts;
}

function calculateColumnStats(
export function calculateColumnStats(
data: Record<string, unknown>[],
accessor: string
): ColumnStats | null {
Expand Down Expand Up @@ -256,7 +256,7 @@ const truncateString = (str: string, maxLength: number = 20) => {
return str.slice(0, maxLength) + "...";
};

const WordCountHistogram = memo(
export const WordCountHistogram = memo(
({
histogramData,
}: {
Expand Down Expand Up @@ -328,7 +328,7 @@ const WordCountHistogram = memo(
);
WordCountHistogram.displayName = "WordCountHistogram";

const CategoricalBarChart = memo(
export const CategoricalBarChart = memo(
({ data }: { data: { value: string; count: number }[] }) => {
// Memoize total count calculation
const totalCount = useMemo(
Expand Down Expand Up @@ -1339,6 +1339,7 @@ export default function ResizableDataTable<T extends Record<string, unknown>>({
}}
onJumpToRow={(index) => setCurrentValueIndex(index)}
currentOperation={currentOperation}
columnStats={columnStats[activeColumn]}
/>
)}
</div>
Expand Down
Loading

0 comments on commit acb655c

Please sign in to comment.