Skip to content

Commit

Permalink
feat: add docker file (#205)
Browse files Browse the repository at this point in the history
* feat: change histograms to be bar charts for categorical columns

* feat: add dockerfile

* feat: add dockerfile

* feat: add dockerfile

* feat: add dockerfile

* chore: add more descriptive error message for websocket failure

* fix: make docker ci work

* fix: add validation

* add documentation for docker container
  • Loading branch information
shreyashankar authored Nov 22, 2024
1 parent 26d02b5 commit 600e92a
Show file tree
Hide file tree
Showing 20 changed files with 699 additions and 263 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/docker-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Builds the DocETL Docker image and smoke-tests that the container stays up.
name: Docker CI

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  docker-build-test:
    runs-on: ubuntu-latest

    steps:
      # Deletes large preinstalled directories from the runner
      # (presumably to free disk space for the image build).
      - name: Remove unnecessary files
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"

      - uses: actions/checkout@v4

      # Drops the Dockerfile line that copies the env file
      # (presumably because the CI checkout does not contain one).
      - name: Remove .env copy from Dockerfile
        run: sed -i '/COPY .env/d' Dockerfile

      - name: Build Docker image
        run: docker build -t docetl .

      - name: Create Docker volume
        run: docker volume create docetl-data

      - name: Test Docker container
        run: |
          # Run the container in detached mode
          docker run -d \
            -p 3000:3000 \
            -p 8000:8000 \
            -v docetl-data:/docetl-data \
            -e FRONTEND_HOST=0.0.0.0 \
            -e FRONTEND_PORT=3000 \
            -e BACKEND_HOST=0.0.0.0 \
            -e BACKEND_PORT=8000 \
            --name docetl-test \
            docetl
          # Wait for container to start up
          sleep 120
          # Check if container is still running
          if [ -n "$(docker ps -q -f name=docetl-test)" ]; then
            echo "Container is running successfully"
          else
            echo "Container failed to stay running"
            docker logs docetl-test
            exit 1
          fi

      # Runs even when an earlier step failed. The container is removed first
      # because a leftover (even exited) container keeps the volume in use and
      # would make the volume removal fail.
      - name: Clean up Docker volume
        if: always()
        run: |
          docker rm -f docetl-test || true
          # Only remove the volume if it was actually created.
          if docker volume inspect docetl-data >/dev/null 2>&1; then
            docker volume rm docetl-data
          fi
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,7 @@ website/.vercel

# typescript
website/*.tsbuildinfo
website/next-env.d.ts
website/next-env.d.ts

# Docker
.docker/
79 changes: 79 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Build stage for Python dependencies
FROM python:3.11-slim AS python-builder

RUN pip install poetry==1.4.2

# Keep the virtualenv inside the project (/app/.venv) so the runtime stage can
# copy it as a single directory.
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache \
    DOCETL_HOME_DIR="/docetl-data"

WORKDIR /app

COPY pyproject.toml poetry.lock ./
COPY docetl/ ./docetl/
COPY server/ ./server/
COPY tests/ ./tests/
# Empty placeholder; presumably pyproject.toml references README.md - TODO confirm
RUN touch README.md

# Install with --no-root first for dependencies, then install with root for entrypoints
RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install --all-extras --no-root && \
    poetry install --all-extras

# Build stage for Node.js dependencies
# Debian-based (glibc) image: the whole website directory, including
# node_modules, is copied into the Debian-based runtime stage below, and
# platform-specific native packages installed on an Alpine (musl) builder may
# fail to load there.
FROM node:20-slim AS node-builder

WORKDIR /app/website

# Update DOCETL_HOME_DIR to match final location
ENV DOCETL_HOME_DIR="/docetl-data"

# Copy the manifests first so the dependency layer is cached until they change.
COPY website/package*.json ./
RUN npm install
COPY website/ ./
RUN npm run build

# Final runtime stage
FROM python:3.11-slim AS runtime

# Install Node.js
# The NodeSource setup script is downloaded to a file instead of being piped
# into bash: with a pipe, a failed download is masked by the exit status of
# bash and the build would continue without the NodeSource repository.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    && curl -fsSL https://deb.nodesource.com/setup_20.x -o /tmp/nodesource_setup.sh \
    && bash /tmp/nodesource_setup.sh \
    && rm -f /tmp/nodesource_setup.sh \
    && apt-get install -y --no-install-recommends nodejs \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy Python virtual environment from builder
ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH" \
    PYTHONPATH="/app" \
    DOCETL_HOME_DIR="/docetl-data"

COPY --from=python-builder /app/.venv ${VIRTUAL_ENV}

# Copy Python application files
COPY docetl/ ./docetl/
COPY server/ ./server/
COPY tests/ ./tests/
COPY pyproject.toml poetry.lock ./
# NOTE(review): the next line stores the local env file, including any API keys
# in it, in an image layer. Do not publish images built this way; consider
# passing the file at run time (e.g. `docker run --env-file`) instead.
COPY .env ./

# Copy Node.js dependencies and application files
COPY --from=node-builder /app/website ./website

ENV PORT=3000

# Create data directory with appropriate permissions
RUN mkdir -p /docetl-data && chown -R nobody:nogroup /docetl-data && chmod 777 /docetl-data

# Define volume AFTER creating and setting permissions
VOLUME ["/docetl-data"]

# Expose ports for frontend and backend
EXPOSE 3000 8000

# Start both servers
# NOTE(review): the backend runs as a background job of the shell, so the
# container keeps running on the frontend alone if the backend exits.
CMD ["sh", "-c", "python3 server/app/main.py & cd website && npm run start"]
26 changes: 23 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Load environment variables from .env file
include .env

.PHONY: tests tests-basic lint install mypy update ui-install ui-run
.PHONY: tests tests-basic lint install mypy update ui-install ui-run docker

# Existing commands
tests:
Expand All @@ -25,7 +25,7 @@ mypy:
update:
poetry update

# New UI-related commands
# UI-related commands
UI_DIR := ./website

install-ui:
Expand All @@ -43,6 +43,24 @@ run-ui:
echo "Building UI..." && \
cd $(UI_DIR) && npm run build && HOST=${FRONTEND_HOST} PORT=${FRONTEND_PORT} NEXT_PUBLIC_FRONTEND_ALLOWED_HOSTS=${FRONTEND_ALLOWED_HOSTS} npm run start

# Docker image and volume names; override on the command line if needed,
# e.g. `make docker DOCKER_IMAGE=my-docetl`.
DOCKER_IMAGE ?= docetl
DOCKER_VOLUME ?= docetl-data

# Neither target produces a file, so declare both phony.
.PHONY: docker docker-clean

# Single Docker command to build and run
# Creates the named volume, builds the image, then runs it in the foreground
# with the UI on port 3000 and the backend on port 8000. The container is
# removed on exit (--rm); the volume is kept.
docker:
	docker volume create $(DOCKER_VOLUME) && \
	docker build -t $(DOCKER_IMAGE) . && \
	docker run --rm -it \
		-p 3000:3000 \
		-p 8000:8000 \
		-v $(DOCKER_VOLUME):/docetl-data \
		-e FRONTEND_HOST=0.0.0.0 \
		-e FRONTEND_PORT=3000 \
		-e BACKEND_HOST=0.0.0.0 \
		-e BACKEND_PORT=8000 \
		$(DOCKER_IMAGE)

# Clean up docker resources: removes the named volume and the data stored in it
docker-clean:
	docker volume rm $(DOCKER_VOLUME)

# Help command
help:
@echo "Available commands:"
Expand All @@ -54,5 +72,7 @@ help:
@echo " make update : Update dependencies"
@echo " make install-ui : Install UI dependencies"
@echo " make run-ui-dev : Run UI development server"
@echo " make run-ui-prod : Run UI production server"
@echo " make run-ui : Run UI production server"
@echo " make docker : Build and run docetl in Docker"
@echo " make docker-clean : Remove docetl Docker volume"
@echo " make help : Show this help message"
62 changes: 51 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,33 +33,74 @@ DocETL is the ideal choice when you're looking to maximize correctness and outpu

## Installation

### Prerequisites
There are three ways to run DocETL:

### 1. Using Docker (Recommended for Quick Start)

The easiest way to get started is using Docker:

1. Create the required environment files:

Create `.env` in the root directory:
```bash
OPENAI_API_KEY=your_api_key_here
BACKEND_ALLOW_ORIGINS=
BACKEND_HOST=0.0.0.0
BACKEND_PORT=8000
BACKEND_RELOAD=True
FRONTEND_HOST=0.0.0.0
FRONTEND_PORT=3000
```

Create `.env.local` in the `website` directory:
```bash
OPENAI_API_KEY=sk-xxx
OPENAI_API_BASE=https://api.openai.com/v1
MODEL_NAME=gpt-4o-mini

NEXT_PUBLIC_BACKEND_HOST=localhost
NEXT_PUBLIC_BACKEND_PORT=8000
```

2. Run Docker:
```bash
make docker
```

This will:
- Create a Docker volume for persistent data
- Build the DocETL image
- Run the container with the UI accessible at http://localhost:3000 and API at http://localhost:8000

To clean up Docker resources (note that this will delete the Docker volume):
```bash
make docker-clean
```

### 2. Using pip (Basic Installation)

If you just want to use DocETL as a Python package:

#### Prerequisites
- Python 3.10 or later
- OpenAI API key

### Quick Start

1. Install from PyPI:
```bash
pip install docetl
```

To see examples of how to use DocETL, check out the [tutorial](https://ucbepic.github.io/docetl/tutorial/).

### Running the UI Locally

We offer a simple UI for building pipelines. We recommend building up complex pipelines one operation at a time, so you can see the results of each operation as you go and iterate on your pipeline. To run it locally, follow these steps:
### 3. Running the UI Locally (Development Setup)

![Playground Screenshot](docs/assets/tutorial/playground-screenshot.png)
For development or if you want to run the UI locally:

1. Clone the repository:
```bash
git clone https://github.com/ucbepic/docetl.git
cd docetl
```


2. Set up environment variables in `.env` in the root/top-level directory:
```bash
OPENAI_API_KEY=your_api_key_here
Expand All @@ -72,7 +113,6 @@ FRONTEND_PORT=3000
```

Also create a `.env.local` file in the `website` directory with the following contents:

```bash
OPENAI_API_KEY=sk-xxx
OPENAI_API_BASE=https://api.openai.com/v1
Expand All @@ -88,7 +128,7 @@ make install # Install Python package
make install-ui # Install UI dependencies
```

Note that the openai api key, base, and model name are for the UI assistant only; not the DocETL pipeline execution engine.
Note that the OpenAI API key, API base, and model name in `.env.local` are used only by the UI assistant, not by the DocETL pipeline execution engine.

4. Start the development server:
```bash
Expand Down
2 changes: 1 addition & 1 deletion docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def __init__(
self.resume = resume
self.captured_output = CapturedOutput()

home_dir = os.path.expanduser("~")
home_dir = os.environ.get("DOCETL_HOME_DIR", os.path.expanduser("~"))
cache_dir = os.path.join(home_dir, f".docetl/cache/{runner.yaml_file_suffix}")
os.makedirs(cache_dir, exist_ok=True)
self.datasets = DatasetOnDisk(dir=cache_dir, console=self.console)
Expand Down
3 changes: 2 additions & 1 deletion docetl/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ def get_operations():
"""Load all available operations and return them as a dictionary"""
return {
op.name: op.load()
for op in importlib.metadata.entry_points(group="docetl.operation")}
for op in importlib.metadata.entry_points(group="docetl.operation")
}
Loading

0 comments on commit 600e92a

Please sign in to comment.