From cf46d9c923b1d720302dbb5b742749ec8a8bcbef Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Wed, 20 Nov 2024 17:19:59 -0800 Subject: [PATCH] feat: change histograms to be bar charts for categorical columns --- server/app/routes/convert.py | 36 +-- website/src/components/FileExplorer.tsx | 4 +- website/src/components/ResizableDataTable.tsx | 245 ++++++++++++++---- 3 files changed, 215 insertions(+), 70 deletions(-) diff --git a/server/app/routes/convert.py b/server/app/routes/convert.py index 2474c34d..a896bb37 100644 --- a/server/app/routes/convert.py +++ b/server/app/routes/convert.py @@ -43,23 +43,25 @@ def process_document_with_azure(file_path: str, endpoint: str, key: str) -> str: @router.post("/api/convert-documents") async def convert_documents(files: List[UploadFile] = File(...)): - # First try Modal endpoint - try: - async with aiohttp.ClientSession() as session: - # Prepare files for multipart upload - data = aiohttp.FormData() - for file in files: - data.add_field('files', - await file.read(), - filename=file.filename, - content_type=file.content_type) + # First try Modal endpoint if there are no txt files + all_txt_files = all(file.filename.lower().endswith('.txt') or file.filename.lower().endswith('.md') for file in files) + if not all_txt_files: + try: + async with aiohttp.ClientSession() as session: + # Prepare files for multipart upload + data = aiohttp.FormData() + for file in files: + data.add_field('files', + await file.read(), + filename=file.filename, + content_type=file.content_type) - async with session.post(MODAL_ENDPOINT, data=data, timeout=120) as response: - if response.status == 200: - return await response.json() + async with session.post(MODAL_ENDPOINT, data=data, timeout=120) as response: + if response.status == 200: + return await response.json() - except Exception as e: - print(f"Modal endpoint failed: {str(e)}. Falling back to local processing...") + except Exception as e: + print(f"Modal endpoint failed: {str(e)}. Falling back to local processing...") # If Modal fails, fall back to local processing from docling.document_converter import DocumentConverter @@ -70,7 +72,7 @@ async def convert_documents(files: List[UploadFile] = File(...)): # Save uploaded files to temporary directory file_paths = [] original_filenames = [] # Keep track of original filenames - txt_files = [] # Track which files are .txt + txt_files = [] # Track which files are .txt or markdown for file in files: # Reset file position since we might have read it in the Modal attempt await file.seek(0) @@ -82,7 +84,7 @@ async def convert_documents(files: List[UploadFile] = File(...)): buffer.write(content) file_paths.append(file_path) original_filenames.append(file.filename) - txt_files.append(file.filename.lower().endswith('.txt')) + txt_files.append(file.filename.lower().endswith('.txt') or file.filename.lower().endswith('.md')) # Convert all documents results = [] diff --git a/website/src/components/FileExplorer.tsx b/website/src/components/FileExplorer.tsx index 4755f3ca..395ac6c2 100644 --- a/website/src/components/FileExplorer.tsx +++ b/website/src/components/FileExplorer.tsx @@ -778,7 +778,7 @@ export const FileExplorer: React.FC = ({ type="file" multiple className="hidden" - accept=".pdf,.docx,.doc,.txt,.html,.pptx" + accept=".pdf,.docx,.doc,.txt,.html,.pptx,.md" onChange={(e) => { if (e.target.files) { handleFolderUpload(e.target.files); @@ -790,7 +790,7 @@ export const FileExplorer: React.FC = ({

- Supported formats: PDF, DOCX, DOC, TXT, HTML, PPTX + Supported formats: PDF, DOCX, DOC, TXT, HTML, PPTX, MD

Processing may take up to 2 minutes diff --git a/website/src/components/ResizableDataTable.tsx b/website/src/components/ResizableDataTable.tsx index d5d29fca..39b3e422 100644 --- a/website/src/components/ResizableDataTable.tsx +++ b/website/src/components/ResizableDataTable.tsx @@ -38,6 +38,7 @@ import { ArrowUpDown, ArrowUp, ArrowDown, + Search, } from "lucide-react"; import { DropdownMenu, @@ -48,9 +49,15 @@ import { import { TABLE_SETTINGS_KEY } from "@/app/localStorageKeys"; import ReactMarkdown from "react-markdown"; import debounce from "lodash/debounce"; -import { BarChart, Bar, XAxis, Tooltip, ResponsiveContainer } from "recharts"; +import { + BarChart, + Bar, + XAxis, + Tooltip, + ResponsiveContainer, + YAxis, +} from "recharts"; import { Input } from "@/components/ui/input"; -import { Search } from "lucide-react"; export type DataType = Record; export type ColumnType = ColumnDef & { @@ -65,6 +72,27 @@ interface ColumnStats { distribution: number[]; bucketSize: number; type: "number" | "array" | "string-words" | "string-chars" | "boolean"; + distinctCount: number; + totalCount: number; + isLowCardinality: boolean; + sortedValueCounts: { value: string; count: number }[]; +} + +function calculateDistinctValueCounts( + data: Record[], + accessor: string +): Map { + const valueCounts = new Map(); + + data.forEach((row) => { + const value = row[accessor]; + if (value != null) { + const key = typeof value === "object" ? JSON.stringify(value) : value; + valueCounts.set(key, (valueCounts.get(key) || 0) + 1); + } + }); + + return valueCounts; } function calculateColumnStats( @@ -139,6 +167,19 @@ function calculateColumnStats( const max = Math.max(...values); const avg = values.reduce((sum, val) => sum + val, 0) / values.length; + const valueCounts = calculateDistinctValueCounts(data, accessor); + const distinctCount = valueCounts.size; + const totalCount = data.filter((row) => row[accessor] != null).length; + const isLowCardinality = distinctCount < totalCount * 0.5; + + // Convert value counts to sorted array for bar chart + const sortedValueCounts = Array.from(valueCounts.entries()) + .sort((a, b) => b[1] - a[1]) // Sort by count in descending order + .map(([value, count]) => ({ + value: String(value), + count, + })); + // For boolean values, create a special two-bucket distribution if (type === "boolean") { const distribution = [0, 0]; // [false count, true count] @@ -152,6 +193,10 @@ function calculateColumnStats( distribution, bucketSize: 1, type, + distinctCount, + totalCount, + isLowCardinality, + sortedValueCounts, }; } @@ -164,6 +209,10 @@ function calculateColumnStats( distribution: [values.length], // Put all values in a single bucket bucketSize: 1, type, + distinctCount, + totalCount, + isLowCardinality, + sortedValueCounts, }; } @@ -188,6 +237,10 @@ function calculateColumnStats( distribution, bucketSize, type, + distinctCount, + totalCount, + isLowCardinality, + sortedValueCounts, }; } @@ -246,26 +299,69 @@ const WordCountHistogram = React.memo( ); WordCountHistogram.displayName = "WordCountHistogram"; +const CategoricalBarChart = React.memo( + ({ data }: { data: { value: string; count: number }[] }) => { + const totalCount = useMemo( + () => data.reduce((sum, item) => sum + item.count, 0), + [data] + ); + + // Take top 10 values for visualization + const displayData = data.slice(0, 10); + + return ( + + + + [ + `${value.toLocaleString()} (${( + (value / totalCount) * + 100 + ).toFixed(1)}%)`, + "Count", + ]} + labelFormatter={(label: string) => label} + contentStyle={{ + backgroundColor: "hsl(var(--popover))", + border: "1px solid hsl(var(--border))", + borderRadius: "var(--radius)", + color: "hsl(var(--popover-foreground))", + padding: "8px 12px", + boxShadow: "0 2px 4px rgba(0,0,0,0.1)", + }} + wrapperStyle={{ zIndex: 1000 }} + /> + + + + ); + } +); +CategoricalBarChart.displayName = "CategoricalBarChart"; + interface ColumnHeaderProps { header: string; stats: ColumnStats | null; isBold: boolean; - onSort: () => void; - sortDirection: "asc" | "desc" | false; onFilter: (value: string) => void; filterValue: string; } const ColumnHeader = React.memo( - ({ - header, - stats, - isBold, - onSort, - sortDirection, - onFilter, - filterValue, - }: ColumnHeaderProps) => { + ({ header, stats, isBold, onFilter, filterValue }: ColumnHeaderProps) => { const histogramData = useMemo(() => { if (!stats) return []; @@ -326,6 +422,9 @@ const ColumnHeader = React.memo( return (

+
+ {header} +
- {stats.min === stats.max ? ( + {stats.isLowCardinality ? ( - Single value: {stats.min} - {stats.type === "array" - ? " items" - : stats.type === "string-words" - ? " words" - : ""} + {stats.distinctCount} distinct values ) : ( <> @@ -378,7 +472,11 @@ const ColumnHeader = React.memo( )}
- + {stats.isLowCardinality ? ( + + ) : ( + + )}
)} @@ -703,7 +801,7 @@ function ResizableDataTable({ data, columns, boldedColumns, - startingRowHeight = 60, // Default starting height + startingRowHeight = 60, }: ResizableDataTableProps) { const [columnSizing, setColumnSizing] = useState(() => { const savedSettings = localStorage.getItem(TABLE_SETTINGS_KEY); @@ -870,34 +968,83 @@ function ResizableDataTable({ }, }); + const resetColumnWidths = useCallback(() => { + const initialSizing: ColumnSizingState = {}; + columns.forEach((column) => { + if (column.initialWidth) { + initialSizing[column.id as string] = column.initialWidth; + } else { + // Get all values for this column + const values = data.map((row) => { + const value = row[column.accessorKey as string]; + return value ? String(value) : ""; + }); + + // Estimate width based on content (including header) + const header = column.header as string; + const maxContentLength = Math.max( + header.length, + ...values.map((v) => v.length) + ); + + // Estimate width: ~8px per character, with min 150px and max 400px + const estimatedWidth = Math.min( + Math.max(maxContentLength * 8, 150), + 400 + ); + + initialSizing[column.id as string] = estimatedWidth; + } + }); + + // Update the table's column sizing state + table.setColumnSizing(initialSizing); + + // Update our local state and save settings + setColumnSizing(initialSizing); + saveSettings(); + }, [columns, data, saveSettings, table]); + return (
- - - - - - {table.getAllLeafColumns().map((column) => { - return ( - column.toggleVisibility(!!value)} - > - {column.id} - - ); - })} - - +
+ + + + + + {table.getAllLeafColumns().map((column) => { + return ( + + column.toggleVisibility(!!value) + } + > + {column.id} + + ); + })} + + + +
{data.length > 0 && (
@@ -959,10 +1106,6 @@ function ResizableDataTable({ isBold={boldedColumns.includes( header.column.columnDef.header as string )} - onSort={() => header.column.toggleSorting()} - sortDirection={ - header.column.getIsSorted() as false | "asc" | "desc" - } onFilter={(value) => header.column.setFilterValue(value)} filterValue={ (header.column.getFilterValue() as string) ?? ""