From 600e92aa3150ca04e22debcaf2dae905e12243ce Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Fri, 22 Nov 2024 09:45:11 -0800 Subject: [PATCH] feat: add docker file (#205) * feat: change histograms to be bar charts for categorical columns * feat: add dockerfile * feat: add dockerfile * feat: add dockerfile * feat: add dockerfile * chore: add more descriptive error message for websocket failure * fix: make docker ci work * fix: add validation * add documentation for docker container --- .github/workflows/docker-ci.yml | 59 +++++ .gitignore | 5 +- Dockerfile | 79 ++++++ Makefile | 26 +- README.md | 62 ++++- docetl/builder.py | 2 +- docetl/operations/__init__.py | 3 +- docs/playground/index.md | 66 ++++- website/src/app/api/getInputOutput/route.ts | 16 +- .../src/app/api/getPipelineConfig/route.ts | 13 +- website/src/app/api/runPipeline/route.ts | 33 +-- website/src/app/api/saveDocuments/route.ts | 4 +- website/src/app/api/uploadFile/route.ts | 3 +- website/src/app/api/utils.ts | 3 +- .../src/app/api/writePipelineConfig/route.ts | 6 +- website/src/components/FileExplorer.tsx | 167 ++++++++---- website/src/components/ResizableDataTable.tsx | 243 +++++++++--------- website/src/components/operations/args.tsx | 86 ++++++- website/src/contexts/WebSocketContext.tsx | 62 ++++- website/src/hooks/use-toast.ts | 24 +- 20 files changed, 699 insertions(+), 263 deletions(-) create mode 100644 .github/workflows/docker-ci.yml create mode 100644 Dockerfile diff --git a/.github/workflows/docker-ci.yml b/.github/workflows/docker-ci.yml new file mode 100644 index 00000000..0dcebf87 --- /dev/null +++ b/.github/workflows/docker-ci.yml @@ -0,0 +1,59 @@ +name: Docker CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + docker-build-test: + runs-on: ubuntu-latest + + steps: + - name: Remove unnecessary files + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - uses: actions/checkout@v4 + - name: Remove .env copy from Dockerfile + run: sed -i '/COPY .env/d' Dockerfile + + - name: Build Docker image + run: docker build -t docetl . + + - name: Create Docker volume + run: docker volume create docetl-data + + - name: Test Docker container + run: | + # Run the container in detached mode + docker run -d \ + -p 3000:3000 \ + -p 8000:8000 \ + -v docetl-data:/docetl-data \ + -e FRONTEND_HOST=0.0.0.0 \ + -e FRONTEND_PORT=3000 \ + -e BACKEND_HOST=0.0.0.0 \ + -e BACKEND_PORT=8000 \ + --name docetl-test \ + docetl + + # Wait for container to start up + sleep 120 + + # Check if container is still running + if [ "$(docker ps -q -f name=docetl-test)" ]; then + echo "Container is running successfully" + else + echo "Container failed to stay running" + docker logs docetl-test + exit 1 + fi + + # Cleanup + docker stop docetl-test + docker rm docetl-test + + - name: Clean up Docker volume + run: docker volume rm docetl-data \ No newline at end of file diff --git a/.gitignore b/.gitignore index b1d0b6b1..c4834bfc 100644 --- a/.gitignore +++ b/.gitignore @@ -49,4 +49,7 @@ website/.vercel # typescript website/*.tsbuildinfo -website/next-env.d.ts \ No newline at end of file +website/next-env.d.ts + +# Docker +.docker/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..afddf042 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,79 @@ +# Build stage for Python dependencies +FROM python:3.11-slim AS python-builder + +RUN pip install poetry==1.4.2 + +ENV POETRY_NO_INTERACTION=1 \ + POETRY_VIRTUALENVS_IN_PROJECT=1 \ + POETRY_VIRTUALENVS_CREATE=1 \ + POETRY_CACHE_DIR=/tmp/poetry_cache \ + DOCETL_HOME_DIR="/docetl-data" + +WORKDIR /app + +COPY pyproject.toml poetry.lock ./ +COPY docetl/ ./docetl/ +COPY server/ ./server/ +COPY tests/ ./tests/ +RUN touch README.md + +# Install with --no-root first for dependencies, then install with root for entrypoints +RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install --all-extras --no-root && \ + poetry install --all-extras + +# Build stage for Node.js dependencies +FROM node:20-alpine AS node-builder + +WORKDIR /app/website + +# Update DOCETL_HOME_DIR to match final location +ENV DOCETL_HOME_DIR="/docetl-data" + +COPY website/package*.json ./ +RUN npm install +COPY website/ ./ +RUN npm run build + +# Final runtime stage +FROM python:3.11-slim AS runtime + +# Install Node.js +RUN apt-get update && apt-get install -y \ + curl \ + && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy Python virtual environment from builder +ENV VIRTUAL_ENV=/app/.venv \ + PATH="/app/.venv/bin:$PATH" \ + PYTHONPATH="/app" \ + DOCETL_HOME_DIR="/docetl-data" + +COPY --from=python-builder /app/.venv ${VIRTUAL_ENV} + +# Copy Python application files +COPY docetl/ ./docetl/ +COPY server/ ./server/ +COPY tests/ ./tests/ +COPY pyproject.toml poetry.lock ./ +COPY .env ./ + +# Copy Node.js dependencies and application files +COPY --from=node-builder /app/website ./website + +ENV PORT=3000 + +# Create data directory with appropriate permissions +RUN mkdir -p /docetl-data && chown -R nobody:nogroup /docetl-data && chmod 777 /docetl-data + +# Define volume AFTER creating and setting permissions +VOLUME ["/docetl-data"] + +# Expose ports for frontend and backend +EXPOSE 3000 8000 + +# Start both servers +CMD ["sh", "-c", "python3 server/app/main.py & cd website && npm run start"] \ No newline at end of file diff --git a/Makefile b/Makefile index bc147709..f392632e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Load environment variables from .env file include .env -.PHONY: tests tests-basic lint install mypy update ui-install ui-run +.PHONY: tests tests-basic lint install mypy update ui-install ui-run docker # Existing commands tests: @@ -25,7 +25,7 @@ mypy: update: poetry update -# New UI-related commands +# UI-related commands UI_DIR := ./website install-ui: @@ -43,6 +43,24 @@ run-ui: echo "Building UI..." && \ cd $(UI_DIR) && npm run build && HOST=${FRONTEND_HOST} PORT=${FRONTEND_PORT} NEXT_PUBLIC_FRONTEND_ALLOWED_HOSTS=${FRONTEND_ALLOWED_HOSTS} npm run start +# Single Docker command to build and run +docker: + docker volume create docetl-data && \ + docker build -t docetl . && \ + docker run --rm -it \ + -p 3000:3000 \ + -p 8000:8000 \ + -v docetl-data:/docetl-data \ + -e FRONTEND_HOST=0.0.0.0 \ + -e FRONTEND_PORT=3000 \ + -e BACKEND_HOST=0.0.0.0 \ + -e BACKEND_PORT=8000 \ + docetl + +# Add new command for cleaning up docker resources +docker-clean: + docker volume rm docetl-data + # Help command help: @echo "Available commands:" @@ -54,5 +72,7 @@ help: @echo " make update : Update dependencies" @echo " make install-ui : Install UI dependencies" @echo " make run-ui-dev : Run UI development server" - @echo " make run-ui-prod : Run UI production server" + @echo " make run-ui : Run UI production server" + @echo " make docker : Build and run docetl in Docker" + @echo " make docker-clean : Remove docetl Docker volume" @echo " make help : Show this help message" \ No newline at end of file diff --git a/README.md b/README.md index f4da9e76..470cb694 100644 --- a/README.md +++ b/README.md @@ -33,25 +33,67 @@ DocETL is the ideal choice when you're looking to maximize correctness and outpu ## Installation -### Prerequisites +There are three ways to run DocETL: +### 1. Using Docker (Recommended for Quick Start) + +The easiest way to get started is using Docker: + +1. Create the required environment files: + +Create `.env` in the root directory: +```bash +OPENAI_API_KEY=your_api_key_here +BACKEND_ALLOW_ORIGINS= +BACKEND_HOST=0.0.0.0 +BACKEND_PORT=8000 +BACKEND_RELOAD=True +FRONTEND_HOST=0.0.0.0 +FRONTEND_PORT=3000 +``` + +Create `.env.local` in the `website` directory: +```bash +OPENAI_API_KEY=sk-xxx +OPENAI_API_BASE=https://api.openai.com/v1 +MODEL_NAME=gpt-4o-mini + +NEXT_PUBLIC_BACKEND_HOST=localhost +NEXT_PUBLIC_BACKEND_PORT=8000 +``` + +2. Run Docker: +```bash +make docker +``` + +This will: +- Create a Docker volume for persistent data +- Build the DocETL image +- Run the container with the UI accessible at http://localhost:3000 and API at http://localhost:8000 + +To clean up Docker resources (note that this will delete the Docker volume): +```bash +make docker-clean +``` + +### 2. Using pip (Basic Installation) + +If you just want to use DocETL as a Python package: + +#### Prerequisites - Python 3.10 or later - OpenAI API key -### Quick Start - -1. Install from PyPI: ```bash pip install docetl ``` To see examples of how to use DocETL, check out the [tutorial](https://ucbepic.github.io/docetl/tutorial/). -### Running the UI Locally - -We offer a simple UI for building pipelines. We recommend building up complex pipelines one operation at a time, so you can see the results of each operation as you go and iterate on your pipeline. To run it locally, follow these steps: +### 3. Running the UI Locally (Development Setup) -![Playground Screenshot](docs/assets/tutorial/playground-screenshot.png) +For development or if you want to run the UI locally: 1. Clone the repository: ```bash @@ -59,7 +101,6 @@ git clone https://github.com/ucbepic/docetl.git cd docetl ``` - 2. Set up environment variables in `.env` in the root/top-level directory: ```bash OPENAI_API_KEY=your_api_key_here @@ -72,7 +113,6 @@ FRONTEND_PORT=3000 ``` And create an .env.local file in the `website` directory with the following: - ```bash OPENAI_API_KEY=sk-xxx OPENAI_API_BASE=https://api.openai.com/v1 @@ -88,7 +128,7 @@ make install # Install Python package make install-ui # Install UI dependencies ``` -Note that the openai api key, base, and model name are for the UI assistant only; not the DocETL pipeline execution engine. +Note that the OpenAI API key, base, and model name are for the UI assistant only; not the DocETL pipeline execution engine. 4. Start the development server: ```bash diff --git a/docetl/builder.py b/docetl/builder.py index 7a8c4766..2ccf7d6a 100644 --- a/docetl/builder.py +++ b/docetl/builder.py @@ -140,7 +140,7 @@ def __init__( self.resume = resume self.captured_output = CapturedOutput() - home_dir = os.path.expanduser("~") + home_dir = os.environ.get("DOCETL_HOME_DIR", os.path.expanduser("~")) cache_dir = os.path.join(home_dir, f".docetl/cache/{runner.yaml_file_suffix}") os.makedirs(cache_dir, exist_ok=True) self.datasets = DatasetOnDisk(dir=cache_dir, console=self.console) diff --git a/docetl/operations/__init__.py b/docetl/operations/__init__.py index 66f91032..43d567b1 100644 --- a/docetl/operations/__init__.py +++ b/docetl/operations/__init__.py @@ -15,4 +15,5 @@ def get_operations(): """Load all available operations and return them as a dictionary""" return { op.name: op.load() - for op in importlib.metadata.entry_points(group="docetl.operation")} + for op in importlib.metadata.entry_points(group="docetl.operation") + } diff --git a/docs/playground/index.md b/docs/playground/index.md index a79c29d7..89f2eaee 100644 --- a/docs/playground/index.md +++ b/docs/playground/index.md @@ -10,15 +10,63 @@ The playground allows you to do just that. ## Installation +There are two ways to run the playground: + +### 1. Using Docker (Recommended for Quick Start) + +The easiest way to get started is using Docker: + +1. Create the required environment files: + +Create `.env` in the root directory: +```bash +OPENAI_API_KEY=your_api_key_here +BACKEND_ALLOW_ORIGINS= +BACKEND_HOST=localhost +BACKEND_PORT=8000 +BACKEND_RELOAD=True +FRONTEND_HOST=localhost +FRONTEND_PORT=3000 +``` + +Create `.env.local` in the `website` directory: +```bash +OPENAI_API_KEY=sk-xxx +OPENAI_API_BASE=https://api.openai.com/v1 +MODEL_NAME=gpt-4o-mini + +NEXT_PUBLIC_BACKEND_HOST=localhost +NEXT_PUBLIC_BACKEND_PORT=8000 +``` + +2. Run Docker: +```bash +make docker +``` + +This will: +- Create a Docker volume for persistent data +- Build the DocETL image +- Run the container with the UI accessible at http://localhost:3000 and API at http://localhost:8000 + +To clean up Docker resources (note that this will delete the Docker volume): +```bash +make docker-clean +``` + +### 2. Running Locally (Development Setup) + +For development or if you want to run the UI locally: + 1. Clone the repository: ```bash git clone https://github.com/ucbepic/docetl.git cd docetl ``` -2. Set up environment variables by creating a `.env` file in the root directory: +2. Set up environment variables in `.env` in the root directory: ```bash -OPENAI_API_KEY=your_api_key_here # Or any other llm keys +OPENAI_API_KEY=your_api_key_here BACKEND_ALLOW_ORIGINS= BACKEND_HOST=localhost BACKEND_PORT=8000 @@ -27,13 +75,11 @@ FRONTEND_HOST=0.0.0.0 FRONTEND_PORT=3000 ``` -The `.env` file is used for the backend server. - -For the front end, create an `.env.local` file in the `website` directory with: +Create `.env.local` in the `website` directory: ```bash OPENAI_API_KEY=sk-xxx OPENAI_API_BASE=https://api.openai.com/v1 -MODEL_NAME=gpt-4-mini +MODEL_NAME=gpt-4o-mini NEXT_PUBLIC_BACKEND_HOST=localhost NEXT_PUBLIC_BACKEND_PORT=8000 @@ -53,7 +99,7 @@ make install-ui # Install UI dependencies make run-ui-dev ``` -5. Navigate to [http://localhost:3000/playground](http://localhost:3000/playground) to access the playground. +5. Navigate to http://localhost:3000/playground to access the playground. ### Setting up the AI Assistant @@ -61,12 +107,6 @@ The UI offers an optional chat-based assistant that can help you iteratively dev To use the assistant, you need to set your OpenAI API key in the `.env.local` file in the website directory. You can get an API key [here](https://platform.openai.com/api-keys). The API key should be in the following format: `sk-proj-...`. We only support the openai models for the assistant. -Your `.env.local` file should look like this: - -``` -OPENAI_API_KEY=sk-proj-... -``` - ## Complex Tutorial See this [YouTube video](https://www.youtube.com/watch?v=IlgueVqtHGo) for a more in depth tutorial on how to use the playground. \ No newline at end of file diff --git a/website/src/app/api/getInputOutput/route.ts b/website/src/app/api/getInputOutput/route.ts index 02f5c90f..897bc44e 100644 --- a/website/src/app/api/getInputOutput/route.ts +++ b/website/src/app/api/getInputOutput/route.ts @@ -1,7 +1,7 @@ import { NextResponse } from "next/server"; import { generatePipelineConfig } from "@/app/api/utils"; import fs from "fs/promises"; - +import os from "os"; export async function POST(request: Request) { try { const { default_model, data, operations, operation_id, name, sample_size } = @@ -10,24 +10,26 @@ export async function POST(request: Request) { if (!name) { return NextResponse.json( { error: "Pipeline name is required" }, - { status: 400 }, + { status: 400 } ); } if (!data) { return NextResponse.json( { error: "Data is required. Please select a file in the sidebar." }, - { status: 400 }, + { status: 400 } ); } + const homeDir = process.env.DOCETL_HOME_DIR || os.homedir(); const { inputPath, outputPath } = generatePipelineConfig( default_model, data, operations, operation_id, name, - sample_size, + homeDir, + sample_size ); // Check if inputPath exists @@ -37,7 +39,7 @@ export async function POST(request: Request) { console.error(`Input path does not exist: ${inputPath}`); return NextResponse.json( { error: "Input path does not exist" }, - { status: 400 }, + { status: 400 } ); } @@ -48,7 +50,7 @@ export async function POST(request: Request) { console.error(`Output path does not exist: ${outputPath}`); return NextResponse.json( { error: "Output path does not exist" }, - { status: 400 }, + { status: 400 } ); } @@ -57,7 +59,7 @@ export async function POST(request: Request) { console.error(error); return NextResponse.json( { error: "Failed to get input and output paths" }, - { status: 500 }, + { status: 500 } ); } } diff --git a/website/src/app/api/getPipelineConfig/route.ts b/website/src/app/api/getPipelineConfig/route.ts index f0f7c8c1..908a2df4 100644 --- a/website/src/app/api/getPipelineConfig/route.ts +++ b/website/src/app/api/getPipelineConfig/route.ts @@ -1,6 +1,6 @@ import { NextResponse } from "next/server"; import { generatePipelineConfig } from "@/app/api/utils"; - +import os from "os"; export async function POST(request: Request) { try { const { default_model, data, operations, operation_id, name, sample_size } = @@ -9,24 +9,27 @@ export async function POST(request: Request) { if (!name) { return NextResponse.json( { error: "Pipeline name is required" }, - { status: 400 }, + { status: 400 } ); } if (!data) { return NextResponse.json( { error: "Data is required. Please select a file in the sidebar." }, - { status: 400 }, + { status: 400 } ); } + const homeDir = process.env.DOCETL_HOME_DIR || os.homedir(); + const { yamlString } = generatePipelineConfig( default_model, data, operations, operation_id, name, - sample_size, + homeDir, + sample_size ); return NextResponse.json({ pipelineConfig: yamlString }); @@ -34,7 +37,7 @@ export async function POST(request: Request) { console.error(error); return NextResponse.json( { error: "Failed to generate pipeline configuration" }, - { status: 500 }, + { status: 500 } ); } } diff --git a/website/src/app/api/runPipeline/route.ts b/website/src/app/api/runPipeline/route.ts index ed505000..2c1726a3 100644 --- a/website/src/app/api/runPipeline/route.ts +++ b/website/src/app/api/runPipeline/route.ts @@ -14,19 +14,19 @@ export async function POST(request: Request) { if (!name) { return NextResponse.json( { error: "Pipeline name is required" }, - { status: 400 }, + { status: 400 } ); } if (!data) { return NextResponse.json( { error: "Data is required. Please select a file in the sidebar." }, - { status: 400 }, + { status: 400 } ); } // Create pipeline configuration based on tutorial.yaml example - const homeDir = os.homedir(); + const homeDir = process.env.DOCETL_HOME_DIR || os.homedir(); const datasets = { input: { @@ -59,7 +59,7 @@ export async function POST(request: Request) { if (item.type === "list") { if (!item.subType) { throw new Error( - `List type must specify its elements for field: ${item.key}`, + `List type must specify its elements for field: ${item.key}` ); } const subType = @@ -70,7 +70,7 @@ export async function POST(request: Request) { } else if (item.type === "dict") { if (!item.subType) { throw new Error( - `Dict/Object type must specify its structure for field: ${item.key}`, + `Dict/Object type must specify its structure for field: ${item.key}` ); } const subSchema = Object.entries(item.subType).reduce( @@ -78,7 +78,7 @@ export async function POST(request: Request) { acc[key] = processSchemaItem(value as SchemaItem); return acc; }, - {} as Record, + {} as Record ); return JSON.stringify(subSchema); } else { @@ -94,17 +94,17 @@ export async function POST(request: Request) { acc[item.key] = processSchemaItem(item); return acc; }, - {}, + {} ), }, }; - }, + } ); // Fetch all operations up until and including the operation_id const operationsToRun = operations.slice( 0, - operations.findIndex((op: Operation) => op.id === operation_id) + 1, + operations.findIndex((op: Operation) => op.id === operation_id) + 1 ); const pipelineConfig = { @@ -126,14 +126,14 @@ export async function POST(request: Request) { ".docetl", "pipelines", "outputs", - `${name}.json`, + `${name}.json` ), intermediate_dir: path.join( homeDir, ".docetl", "pipelines", name, - "intermediates", + "intermediates" ), }, }, @@ -163,9 +163,12 @@ export async function POST(request: Request) { // Submit the YAML config to the FastAPI endpoint - const response = await axios.post(`http://${process.env.NEXT_PUBLIC_BACKEND_HOST}:${process.env.NEXT_PUBLIC_BACKEND_PORT}/run_pipeline`, { - yaml_config: filePath, - }); + const response = await axios.post( + `http://${process.env.NEXT_PUBLIC_BACKEND_HOST}:${process.env.NEXT_PUBLIC_BACKEND_PORT}/run_pipeline`, + { + yaml_config: filePath, + } + ); return NextResponse.json({ message: "Pipeline YAML created and submitted successfully", @@ -189,7 +192,7 @@ export async function POST(request: Request) { } return NextResponse.json( { error: `Failed to run pipeline YAML: ${errorMessage}` }, - { status: 500 }, + { status: 500 } ); } } diff --git a/website/src/app/api/saveDocuments/route.ts b/website/src/app/api/saveDocuments/route.ts index f1cb8c36..6e60913f 100644 --- a/website/src/app/api/saveDocuments/route.ts +++ b/website/src/app/api/saveDocuments/route.ts @@ -18,8 +18,10 @@ export async function POST(request: NextRequest) { return NextResponse.json({ error: "No files provided" }, { status: 400 }); } + const homeDir = process.env.DOCETL_HOME_DIR || os.homedir(); + // Create uploads directory in user's home directory if it doesn't exist - const uploadsDir = path.join(os.homedir(), ".docetl", "documents"); + const uploadsDir = path.join(homeDir, ".docetl", "documents"); await mkdir(uploadsDir, { recursive: true }); const savedFiles = await Promise.all( diff --git a/website/src/app/api/uploadFile/route.ts b/website/src/app/api/uploadFile/route.ts index ded1dfaa..d8838d5c 100644 --- a/website/src/app/api/uploadFile/route.ts +++ b/website/src/app/api/uploadFile/route.ts @@ -18,7 +18,8 @@ export async function POST(request: NextRequest) { const buffer = Buffer.from(bytes); // Create uploads directory in user's home directory if it doesn't exist - const uploadDir = path.join(os.homedir(), ".docetl", "files"); + const homeDir = process.env.DOCETL_HOME_DIR || os.homedir(); + const uploadDir = path.join(homeDir, ".docetl", "files"); await mkdir(uploadDir, { recursive: true }); // Create full file path diff --git a/website/src/app/api/utils.ts b/website/src/app/api/utils.ts index 1e359f34..150d05e8 100644 --- a/website/src/app/api/utils.ts +++ b/website/src/app/api/utils.ts @@ -9,12 +9,11 @@ export function generatePipelineConfig( operations: Operation[], operation_id: string, name: string, + homeDir: string, sample_size: number | null, optimize: boolean = false, clear_intermediate: boolean = false ) { - const homeDir = os.homedir(); - const datasets = { input: { type: "file", diff --git a/website/src/app/api/writePipelineConfig/route.ts b/website/src/app/api/writePipelineConfig/route.ts index d39726e5..21e57ddd 100644 --- a/website/src/app/api/writePipelineConfig/route.ts +++ b/website/src/app/api/writePipelineConfig/route.ts @@ -31,21 +31,21 @@ export async function POST(request: Request) { ); } + const homeDir = process.env.DOCETL_HOME_DIR || os.homedir(); + const { yamlString, inputPath, outputPath } = generatePipelineConfig( default_model, data, operations, operation_id, name, + homeDir, sample_size, optimize, clear_intermediate ); - console.log(yamlString); - // Save the YAML file in the user's home directory - const homeDir = os.homedir(); const pipelineDir = path.join(homeDir, ".docetl", "pipelines"); const configDir = path.join(pipelineDir, "configs"); const nameDir = path.join(pipelineDir, name, "intermediates"); diff --git a/website/src/components/FileExplorer.tsx b/website/src/components/FileExplorer.tsx index 395ac6c2..2f57b781 100644 --- a/website/src/components/FileExplorer.tsx +++ b/website/src/components/FileExplorer.tsx @@ -82,8 +82,13 @@ function mergeFileList( return dt.files; } -async function getAllFiles(entry: FileSystemEntry): Promise { - const files: File[] = []; +// Add this type to handle File with relativePath +interface FileWithPath extends File { + relativePath?: string; +} + +async function getAllFiles(entry: FileSystemEntry): Promise { + const files: FileWithPath[] = []; async function processEntry( entry: FileSystemEntry, @@ -92,7 +97,7 @@ async function getAllFiles(entry: FileSystemEntry): Promise { if (entry.isFile) { const fileEntry = entry as FileSystemFileEntry; const file = await new Promise((resolve, reject) => { - // @ts-ignore + // @ts-expect-error FileSystemFileEntry type definitions are incomplete fileEntry.file(resolve, reject); }); @@ -110,10 +115,11 @@ async function getAllFiles(entry: FileSystemEntry): Promise { ) { // Create a new file with the full path const fullPath = path ? `${path}/${file.name}` : file.name; - // @ts-ignore - const newFile = new File([file], fullPath, { type: file.type }); + // @ts-expect-error File constructor with path is not in type definitions + const newFile = new File([file], fullPath, { + type: file.type, + }) as FileWithPath; Object.defineProperty(newFile, "relativePath", { value: fullPath }); - // @ts-ignore files.push(newFile); } } else if (entry.isDirectory) { @@ -137,6 +143,55 @@ async function getAllFiles(entry: FileSystemEntry): Promise { type ConversionMethod = "docling" | "azure"; +interface UploadedDataset { + [key: string]: unknown; +} + +async function validateJsonDataset(file: Blob): Promise { + const text = await file.text(); + let data: unknown; + + try { + data = JSON.parse(text); + } catch { + throw new Error("Invalid JSON format"); + } + + // Check if it's an array + if (!Array.isArray(data)) { + throw new Error( + "Dataset must be an array of objects, like this: [{key: value}, {key: value}]" + ); + } + + // Check if array is not empty + if (data.length === 0) { + throw new Error("Dataset cannot be empty"); + } + + // Check if first item is an object + if (typeof data[0] !== "object" || data[0] === null) { + throw new Error("Dataset must contain objects"); + } + + // Get keys of first object + const firstObjectKeys = Object.keys(data[0]).sort(); + + // Check if all objects have the same keys + const hasConsistentKeys = data.every((item) => { + if (typeof item !== "object" || item === null) return false; + const currentKeys = Object.keys(item).sort(); + return ( + currentKeys.length === firstObjectKeys.length && + currentKeys.every((key, index) => key === firstObjectKeys[index]) + ); + }); + + if (!hasConsistentKeys) { + throw new Error("All objects in dataset must have the same keys"); + } +} + export const FileExplorer: React.FC = ({ files, onFileClick, @@ -187,57 +242,62 @@ export const FileExplorer: React.FC = ({ return; } - if (uploadedFile.type === "application/json") { - setUploadingFiles((prev) => new Set(prev).add(uploadedFile.name)); + if (!uploadedFile.name.toLowerCase().endsWith(".json")) { + toast({ + variant: "destructive", + title: "Error", + description: "Please upload a JSON file", + }); + return; + } + + setUploadingFiles((prev) => new Set(prev).add(uploadedFile.name)); + + try { + // Validate JSON structure before uploading + await validateJsonDataset(uploadedFile); const formData = new FormData(); formData.append("file", uploadedFile); - try { - const response = await fetch("/api/uploadFile", { - method: "POST", - body: formData, - }); + const response = await fetch("/api/uploadFile", { + method: "POST", + body: formData, + }); - if (!response.ok) { - throw new Error("Upload failed"); - } + if (!response.ok) { + throw new Error("Upload failed"); + } - const data = await response.json(); + const data = await response.json(); - const newFile = { - name: uploadedFile.name, - path: data.path, - type: "json" as const, - parentFolder: "root", - }; + const newFile = { + name: uploadedFile.name, + path: data.path, + type: "json" as const, + parentFolder: "root", + }; + + onFileUpload(newFile); + setCurrentFile(newFile); - onFileUpload(newFile); - setCurrentFile(newFile); - - toast({ - title: "Success", - description: "Dataset uploaded successfully", - }); - } catch (error) { - console.error(error); - toast({ - variant: "destructive", - title: "Error", - description: "Failed to upload file", - }); - } finally { - setUploadingFiles((prev) => { - const next = new Set(prev); - next.delete(uploadedFile.name); - return next; - }); - } - } else { + toast({ + title: "Success", + description: "Dataset uploaded successfully", + }); + } catch (error) { + console.error(error); toast({ variant: "destructive", title: "Error", - description: "Please upload a JSON file", + description: + error instanceof Error ? error.message : "Failed to upload file", + }); + } finally { + setUploadingFiles((prev) => { + const next = new Set(prev); + next.delete(uploadedFile.name); + return next; }); } }; @@ -250,16 +310,16 @@ export const FileExplorer: React.FC = ({ const handleFolderUpload = async ( fileList: FileList | DataTransferItemList ) => { - const files: File[] = []; + const files: FileWithPath[] = []; const processItems = async () => { - // @ts-ignore + // @ts-expect-error DataTransferItemList doesn't have proper type support const items = Array.from(fileList); for (const item of items) { if ("webkitGetAsEntry" in item) { // Handle drag and drop - // @ts-ignore + // @ts-expect-error webkitGetAsEntry is not in type definitions const entry = (item as DataTransferItem).webkitGetAsEntry(); if (entry) { const entryFiles = await getAllFiles(entry); @@ -267,8 +327,8 @@ export const FileExplorer: React.FC = ({ } } else { // Handle regular file input - // @ts-ignore - const file = item as File; + // @ts-expect-error FileList type conversion needs explicit cast + const file = item as FileWithPath; const supportedExtensions = [ ".pdf", ".docx", @@ -292,7 +352,7 @@ export const FileExplorer: React.FC = ({ // Create a new FileList-like object with the collected files const dt = new DataTransfer(); - // @ts-ignore + // @ts-expect-error DataTransfer.items.add type is incomplete files.forEach((file) => dt.items.add(file)); setSelectedFiles((prevFiles) => mergeFileList(prevFiles, dt.files)); }; @@ -811,7 +871,8 @@ export const FileExplorer: React.FC = ({

- {(file as any).relativePath || file.name} + {/* @ts-ignore */} + {(file as FileWithPath).relativePath || file.name}

diff --git a/website/src/components/ResizableDataTable.tsx b/website/src/components/ResizableDataTable.tsx index 961f0a19..26e50c73 100644 --- a/website/src/components/ResizableDataTable.tsx +++ b/website/src/components/ResizableDataTable.tsx @@ -31,15 +31,7 @@ import { TableRow, } from "@/components/ui/table"; import { Button } from "@/components/ui/button"; -import { - ChevronLeft, - ChevronRight, - ChevronDown, - ArrowUpDown, - ArrowUp, - ArrowDown, - Search, -} from "lucide-react"; +import { ChevronLeft, ChevronRight, ChevronDown, Search } from "lucide-react"; import { DropdownMenu, DropdownMenuCheckboxItem, @@ -49,14 +41,7 @@ import { import { TABLE_SETTINGS_KEY } from "@/app/localStorageKeys"; import ReactMarkdown from "react-markdown"; import debounce from "lodash/debounce"; -import { - BarChart, - Bar, - XAxis, - Tooltip, - ResponsiveContainer, - YAxis, -} from "recharts"; +import { BarChart, Bar, XAxis, Tooltip, ResponsiveContainer } from "recharts"; import { Input } from "@/components/ui/input"; export type DataType = Record; @@ -87,8 +72,12 @@ function calculateDistinctValueCounts( data.forEach((row) => { const value = row[accessor]; if (value != null) { - const key = typeof value === "object" ? JSON.stringify(value) : String(value); - valueCounts.set(key, (valueCounts.get(key) || 0) + 1); + const key = + typeof value === "object" ? JSON.stringify(value) : String(value); + valueCounts.set( + key as string | number | boolean, + (valueCounts.get(key as string | number | boolean) || 0) + 1 + ); } }); @@ -956,7 +945,6 @@ function ResizableDataTable({ defaultColumn: { minSize: 30, size: 150, - maxSize: Number.MAX_SAFE_INTEGER, }, initialState: { pagination: { @@ -1074,116 +1062,127 @@ function ResizableDataTable({ )}

- - - {table.getHeaderGroups().map((headerGroup) => ( - - # - {headerGroup.headers.map((header) => ( - - {header.isPlaceholder ? null : ( - header.column.setFilterValue(value)} - filterValue={ - (header.column.getFilterValue() as string) ?? "" - } - /> - )} - +
+
+ + {table.getHeaderGroups().map((headerGroup) => ( + + + # - ))} - - ))} - - - {table.getRowModel().rows.map((row, index) => ( - - - - - {row.index + 1} - - - {row.getVisibleCells().map((cell) => ( + {headerGroup.headers.map((header) => ( + + {header.isPlaceholder ? null : ( + + header.column.setFilterValue(value) + } + filterValue={ + (header.column.getFilterValue() as string) ?? "" + } + /> + )} + + + ))} + + ))} + + + + {table.getRowModel().rows.map((row, index) => ( + + -
+ {row.index + 1} + + + {row.getVisibleCells().map((cell) => ( + - {typeof cell.getValue() === "string" ? ( - - ) : ( - flexRender( - cell.column.columnDef.cell, - cell.getContext() - ) - )} -
-
- ))} -
- rowSizing[index] || startingRowHeight, - setSize: (size: number) => { - setRowSizing((prev) => { - const newRowSizing = { ...prev, [index]: size }; - saveSettings(); - return newRowSizing; - }); - }, - }} - /> -
- ))} -
-
+
+ {typeof cell.getValue() === "string" ? ( + + ) : ( + flexRender( + cell.column.columnDef.cell, + cell.getContext() + ) + )} +
+ + ))} + + rowSizing[index] || startingRowHeight, + setSize: (size: number) => { + setRowSizing((prev) => { + const newRowSizing = { ...prev, [index]: size }; + saveSettings(); + return newRowSizing; + }); + }, + }} + /> + + ))} + + + + {data.length > 0 && (