feat!: add LayoutLMv3 model and restructure project architecture

Major changes:
- Integrate LayoutLMv3 model for document classification
- Add new predictor container for model inference
- Create dedicated data models and configurations using dataclasses
- Implement proper model versioning and persistence
- Optimize Docker builds with UV package manager
- Set up volume bindings for models and logs persistence
- Reorganize code for better maintainability and testing
- Add proper error handling and logging
- Implement state management for processing pipeline
- Add proper type hints and documentation

Infrastructure improvements:
- Replace pip with UV for faster package installation
- Add bind mounts for logs and model artifacts
- Implement multi-stage Docker builds
- Configure proper networking between services
- Set up development environment with PDM
- Add start_services.sh script for easy deployment and initialization

BREAKING CHANGE: Complete architecture redesign with new model integration and container structure.
c0deplayer · Nov 17, 2024 · 8859503 · 8859503
1 parent 09e1066
commit 8859503
Show file tree

Hide file tree

Showing 24 changed files with 1,804 additions and 817 deletions.
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
@@ -18,6 +18,8 @@ jobs:
             dockerfile: ./Dockerfile.ocr
           - image: c0deplayer/dc-processor
             dockerfile: ./Dockerfile.processor
+          - image: c0deplayer/dc-predictor
+            dockerfile: ./Dockerfile.predictor
 
     steps:
       - name: Checkout

diff --git a/.gitignore b/.gitignore
@@ -162,5 +162,7 @@ cython_debug/
 #.idea/
 
 # Other
+.DS_Store
 logs/
-__*.py
+__*.py
+models/
diff --git a/Dockerfile.ocr b/Dockerfile.ocr
@@ -1,20 +1,36 @@
-FROM python:3.12-slim
+FROM ghcr.io/astral-sh/uv:latest AS uv
+FROM python:3.12-slim AS python
+
 LABEL authors="codeplayer"
 
-WORKDIR /code
+ENV VIRTUAL_ENV=/opt/venv
+
+WORKDIR /app/data/ocr
 
 # Update and upgrade the system
 RUN apt update -y && \
   apt upgrade -y \
+  # Install required packages
+  && apt install poppler-utils -y \
   # cleanup
   && apt autoremove -y \
   && apt clean -y \
   && rm -rf /var/lib/apt/lists
 
-COPY ./requirements.txt /code/requirements.txt
+RUN  \
+  # we use a cache --mount to reuse the uv cache across builds
+  --mount=type=cache,target=/root/.cache/uv \
+  # we use a bind --mount to use the uv binary from the uv stage
+  --mount=type=bind,from=uv,source=/uv,target=/uv \
+  # we use a bind --mount to use the requirements.txt from the host instead of adding a COPY layer
+  --mount=type=bind,source=requirements.txt,target=requirements.txt \
+  /uv venv /opt/venv && \
+  /uv pip install -r requirements.txt
 
-RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+WORKDIR /app/code/ocr
 
-COPY ./src/documentclassification/ocr/ /code/documentclassification/ocr/
+COPY src/ocr/ .
+COPY src/configs/ocr_config.py configs/ocr_config.py
+COPY src/payload/ocr_models.py payload/ocr_models.py
 
-CMD ["uvicorn", "documentclassification.ocr.ocr:app", "--host", "0.0.0.0", "--port", "8080"]
+CMD ["/opt/venv/bin/python", "-m", "uvicorn", "ocr:app", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/Dockerfile.predictor b/Dockerfile.predictor
@@ -0,0 +1,32 @@
+FROM ghcr.io/astral-sh/uv:latest AS uv
+FROM python:3.12-slim AS python
+
+ENV VIRTUAL_ENV=/opt/venv
+
+WORKDIR /app/data/predictor
+
+# Update and upgrade the system
+RUN apt update -y && \
+  apt upgrade -y \
+  # cleanup
+  && apt autoremove -y \
+  && apt clean -y \
+  && rm -rf /var/lib/apt/lists
+
+RUN  \
+  # we use a cache --mount to reuse the uv cache across builds
+  --mount=type=cache,target=/root/.cache/uv \
+  # we use a bind --mount to use the uv binary from the uv stage
+  --mount=type=bind,from=uv,source=/uv,target=/uv \
+  # we use a bind --mount to use the requirements.txt from the host instead of adding a COPY layer
+  --mount=type=bind,source=requirements.txt,target=requirements.txt \
+  /uv venv /opt/venv && \
+  /uv pip install -r requirements.txt
+
+WORKDIR /app/code/predictor
+
+COPY src/predictor/ .
+COPY src/configs/model_config.py configs/model_config.py
+COPY src/payload/model_models.py payload/model_models.py
+
+CMD ["/opt/venv/bin/python","-m", "uvicorn", "model:app", "--host", "0.0.0.0", "--port", "7070"]
diff --git a/Dockerfile.processor b/Dockerfile.processor
@@ -1,7 +1,11 @@
-FROM python:3.12-slim
+FROM ghcr.io/astral-sh/uv:latest AS uv
+FROM python:3.12-slim AS python
+
 LABEL authors="codeplayer"
 
-WORKDIR /code
+ENV VIRTUAL_ENV=/opt/venv
+
+# RUN mkdir -p /app/data/logs/processor
 
 # Update and upgrade the system
 RUN apt update -y && \
@@ -11,10 +15,20 @@ RUN apt update -y && \
   && apt clean -y \
   && rm -rf /var/lib/apt/lists
 
-COPY ./requirements.txt /code/requirements.txt
+RUN  \
+  # we use a cache --mount to reuse the uv cache across builds
+  --mount=type=cache,target=/root/.cache/uv \
+  # we use a bind --mount to use the uv binary from the uv stage
+  --mount=type=bind,from=uv,source=/uv,target=/uv \
+  # we use a bind --mount to use the requirements.txt from the host instead of adding a COPY layer
+  --mount=type=bind,source=requirements.txt,target=requirements.txt \
+  /uv venv /opt/venv && \
+  /uv pip install -r requirements.txt
 
-RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+WORKDIR /app/code/processor
 
-COPY ./src/documentclassification/process/ /code/documentclassification/process/
+COPY src/processor/ .
+COPY src/configs/processor_config.py configs/processor_config.py
+COPY src/payload/processor_models.py payload/processor_models.py
 
-CMD ["uvicorn", "documentclassification.process.process:app", "--host", "0.0.0.0", "--port", "9090"]
+CMD ["/opt/venv/bin/python", "-m", "uvicorn", "processor:app", "--host", "0.0.0.0", "--port", "9090"]
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -2,13 +2,61 @@ services:
   ocr:
     container_name: ocr_service
     image: c0deplayer/dc-ocr:main
+    # build:
+    #   dockerfile: Dockerfile.ocr
     ports:
       - "8080:8080"
+    networks:
+      - document-classification
+    volumes:
+      - type: bind
+        source: logs
+        target: /app/data/logs
+      - type: bind
+        source: models
+        target: /app/data/models
 
   processor:
     container_name: processor_service
     image: c0deplayer/dc-processor:main
+    # build:
+    #   dockerfile: Dockerfile.processor
     ports:
       - "9090:9090"
+    networks:
+      - document-classification
     depends_on:
-      - ocr
+      - ocr
+    volumes:
+      - type: bind
+        source: logs
+        target: /app/data/logs
+      - type: bind
+        source: models
+        target: /app/data/models
+
+
+  predictor:
+    container_name: predictor_service
+    image: c0deplayer/dc-predictor:main
+    # build:
+    #   dockerfile: Dockerfile.predictor
+    ports:
+      - "7070:7070"
+    networks:
+      - document-classification
+    depends_on:
+      - processor
+    volumes:
+      - type: bind
+        source: logs
+        target: /app/data/logs
+      - type: bind
+        source: models
+        target: /app/data/models
+
+
+
+networks:
+  document-classification:
+    driver: bridge
diff --git a/.gitignore b/.gitignore
@@ -162,5 +162,7 @@ cython_debug/
 #.idea/
 
 # Other
+.DS_Store
 logs/
-__*.py
+__*.py
+models/