Skip to content

Commit

Permalink
feat!: add LayoutLMv3 model and restructure project architecture
Browse files Browse the repository at this point in the history
Major changes:
- Integrate LayoutLMv3 model for document classification
- Add new predictor container for model inference
- Create dedicated data models and configurations using dataclasses
- Implement proper model versioning and persistence
- Optimize Docker builds with UV package manager
- Set up volume bindings for models and logs persistence
- Reorganize code for better maintainability and testing
- Add proper error handling and logging
- Implement state management for processing pipeline
- Add proper type hints and documentation

Infrastructure improvements:
- Replace pip with UV for faster package installation
- Add bind mounts for logs and model artifacts
- Implement multi-stage Docker builds
- Configure proper networking between services
- Set up development environment with PDM
- Add start_services.sh script for easy deployment and initialization

BREAKING CHANGE: Complete architecture redesign with new model integration and container structure.
  • Loading branch information
c0deplayer committed Nov 17, 2024
1 parent 09e1066 commit 8859503
Show file tree
Hide file tree
Showing 24 changed files with 1,804 additions and 817 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ jobs:
dockerfile: ./Dockerfile.ocr
- image: c0deplayer/dc-processor
dockerfile: ./Dockerfile.processor
- image: c0deplayer/dc-predictor
dockerfile: ./Dockerfile.predictor

steps:
- name: Checkout
Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,5 +162,7 @@ cython_debug/
#.idea/

# Other
.DS_Store
logs/
__*.py
__*.py
models/
28 changes: 22 additions & 6 deletions Dockerfile.ocr
Original file line number Diff line number Diff line change
@@ -1,20 +1,36 @@
# syntax=docker/dockerfile:1

# Image for the OCR service: serves the `ocr:app` application with uvicorn on
# port 8080.

# uv release to take the installer binary from. The default keeps the previous
# behaviour (newest release); pass `--build-arg UV_VERSION=<tag>` to pin a
# specific release for reproducible builds.
ARG UV_VERSION=latest

# Helper stage: only used as the source of the /uv binary, which is
# bind-mounted into the dependency-install step and never copied into the image.
FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv

FROM python:3.12-slim AS python

LABEL authors="codeplayer"

# `uv pip install` targets the environment named by VIRTUAL_ENV.
ENV VIRTUAL_ENV=/opt/venv

WORKDIR /app/data/ocr

# Refresh the system packages and install poppler-utils (presumably needed by
# the OCR code for PDF handling -- confirm against src/ocr). apt-get is used
# instead of apt because apt's command-line interface is not meant for scripts;
# --with-new-pkgs keeps the semantics `apt upgrade` had. Package lists are
# removed in the same layer so they never reach the image.
RUN apt-get update -y \
    && apt-get upgrade -y --with-new-pkgs \
    && apt-get install -y poppler-utils \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/*

# Create the virtual environment and install the Python dependencies.
#  - cache mount: reuses the uv download cache across builds
#  - bind mount from the uv stage: provides the /uv binary
#  - bind mount of requirements.txt: reads it from the build context instead
#    of adding a COPY layer
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=uv,source=/uv,target=/uv \
    --mount=type=bind,source=requirements.txt,target=requirements.txt \
    /uv venv /opt/venv \
    && /uv pip install -r requirements.txt

WORKDIR /app/code/ocr

# Service code plus the config and payload modules it is shipped with.
COPY src/ocr/ .
COPY src/configs/ocr_config.py configs/ocr_config.py
COPY src/payload/ocr_models.py payload/ocr_models.py

# Documentation only: the port uvicorn listens on.
EXPOSE 8080

# Run uvicorn with the virtual environment's interpreter.
CMD ["/opt/venv/bin/python", "-m", "uvicorn", "ocr:app", "--host", "0.0.0.0", "--port", "8080"]
32 changes: 32 additions & 0 deletions Dockerfile.predictor
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# syntax=docker/dockerfile:1

# Image for the predictor service: serves the `model:app` application with
# uvicorn on port 7070.

# uv release to take the installer binary from. The default keeps the previous
# behaviour (newest release); pass `--build-arg UV_VERSION=<tag>` to pin a
# specific release for reproducible builds.
ARG UV_VERSION=latest

# Helper stage: only used as the source of the /uv binary, which is
# bind-mounted into the dependency-install step and never copied into the image.
FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv

FROM python:3.12-slim AS python

# `uv pip install` targets the environment named by VIRTUAL_ENV.
ENV VIRTUAL_ENV=/opt/venv

WORKDIR /app/data/predictor

# Refresh the system packages. apt-get is used instead of apt because apt's
# command-line interface is not meant for scripts; --with-new-pkgs keeps the
# semantics `apt upgrade` had. Package lists are removed in the same layer so
# they never reach the image.
RUN apt-get update -y \
    && apt-get upgrade -y --with-new-pkgs \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/*

# Create the virtual environment and install the Python dependencies.
#  - cache mount: reuses the uv download cache across builds
#  - bind mount from the uv stage: provides the /uv binary
#  - bind mount of requirements.txt: reads it from the build context instead
#    of adding a COPY layer
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=uv,source=/uv,target=/uv \
    --mount=type=bind,source=requirements.txt,target=requirements.txt \
    /uv venv /opt/venv \
    && /uv pip install -r requirements.txt

WORKDIR /app/code/predictor

# Service code plus the config and payload modules it is shipped with.
COPY src/predictor/ .
COPY src/configs/model_config.py configs/model_config.py
COPY src/payload/model_models.py payload/model_models.py

# Documentation only: the port uvicorn listens on.
EXPOSE 7070

# Run uvicorn with the virtual environment's interpreter.
CMD ["/opt/venv/bin/python", "-m", "uvicorn", "model:app", "--host", "0.0.0.0", "--port", "7070"]
26 changes: 20 additions & 6 deletions Dockerfile.processor
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# syntax=docker/dockerfile:1

# uv release to take the installer binary from. The default keeps the previous
# behaviour (newest release); pass `--build-arg UV_VERSION=<tag>` to pin a
# specific release for reproducible builds.
ARG UV_VERSION=latest

# Helper stage: only used as the source of the /uv binary for the
# dependency-install step.
FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv

FROM python:3.12-slim AS python

LABEL authors="codeplayer"

# `uv pip install` targets the environment named by VIRTUAL_ENV.
ENV VIRTUAL_ENV=/opt/venv

# RUN mkdir -p /app/data/logs/processor

# Update and upgrade the system
RUN apt update -y && \
Expand All @@ -11,10 +15,20 @@ RUN apt update -y && \
&& apt clean -y \
&& rm -rf /var/lib/apt/lists

# Create the virtual environment and install the Python dependencies.
#  - cache mount: reuses the uv download cache across builds
#  - bind mount from the uv stage: provides the /uv binary
#  - bind mount of requirements.txt: reads it from the build context instead
#    of adding a COPY layer
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=uv,source=/uv,target=/uv \
    --mount=type=bind,source=requirements.txt,target=requirements.txt \
    /uv venv /opt/venv \
    && /uv pip install -r requirements.txt

WORKDIR /app/code/processor

# Service code plus the config and payload modules it is shipped with.
COPY src/processor/ .
COPY src/configs/processor_config.py configs/processor_config.py
COPY src/payload/processor_models.py payload/processor_models.py

# Documentation only: the port uvicorn listens on.
EXPOSE 9090

# Run uvicorn with the virtual environment's interpreter.
CMD ["/opt/venv/bin/python", "-m", "uvicorn", "processor:app", "--host", "0.0.0.0", "--port", "9090"]
50 changes: 49 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
# Three services on one bridge network. Every service bind-mounts the same
# host `logs` and `models` directories at /app/data/logs and /app/data/models.
services:
  # OCR service, published on host port 8080.
  ocr:
    container_name: ocr_service
    image: c0deplayer/dc-ocr:main
    # build:
    #   dockerfile: Dockerfile.ocr
    ports:
      - "8080:8080"
    networks:
      - document-classification
    volumes:
      - type: bind
        source: logs
        target: /app/data/logs
      - type: bind
        source: models
        target: /app/data/models

  # Processor service, published on host port 9090; declared to depend on ocr.
  processor:
    container_name: processor_service
    image: c0deplayer/dc-processor:main
    # build:
    #   dockerfile: Dockerfile.processor
    ports:
      - "9090:9090"
    networks:
      - document-classification
    depends_on:
      - ocr
    volumes:
      - type: bind
        source: logs
        target: /app/data/logs
      - type: bind
        source: models
        target: /app/data/models

  # Predictor service, published on host port 7070; declared to depend on
  # processor.
  predictor:
    container_name: predictor_service
    image: c0deplayer/dc-predictor:main
    # build:
    #   dockerfile: Dockerfile.predictor
    ports:
      - "7070:7070"
    networks:
      - document-classification
    depends_on:
      - processor
    volumes:
      - type: bind
        source: logs
        target: /app/data/logs
      - type: bind
        source: models
        target: /app/data/models

networks:
  document-classification:
    driver: bridge
Loading

0 comments on commit 8859503

Please sign in to comment.