diff --git a/src/configs/ocr_config.py b/src/configs/ocr_config.py index 1aca5b3..636eb25 100644 --- a/src/configs/ocr_config.py +++ b/src/configs/ocr_config.py @@ -11,7 +11,7 @@ class OCRConfig: FILE_SIZE_LIMIT: int = 20 * BYTE_MEGABYTE MAX_PDF_PAGES: int = 15 - TARGET_SIZE: int = 1028 + TARGET_SIZE: int = 1240 MAX_WORKERS: int = 4 LOG_DIR: Path = Path("/app/data/logs/ocr") ACCEPTED_FILE_TYPES: Set[str] = frozenset(
diff --git a/src/ocr/ocr.py b/src/ocr/ocr.py index 20056eb..9935548 100644 --- a/src/ocr/ocr.py +++ b/src/ocr/ocr.py @@ -6,10 +6,10 @@ import requests from fastapi import Depends, FastAPI, HTTPException, UploadFile, status -from optimized_ocr import OptimizedOCR from pdf2image import convert_from_bytes from pdf2image.exceptions import PDFPageCountError from PIL import Image +from tesseract import TesseractOCR from configs.ocr_config import OCRConfig from database import DocumentCreate, DocumentError, DocumentRepository, get_repository @@ -33,7 +33,8 @@ logger = logging.getLogger(__name__) app = FastAPI() -optimizer = OptimizedOCR(config=config) +# optimizer = OptimizedOCR(config=config) +optimizer = TesseractOCR(config=config) class OCRProcessor:
diff --git a/src/ocr/optimized_ocr.py b/src/ocr/optimized_ocr.py index c7c1ef8..757b3f1 100644 --- a/src/ocr/optimized_ocr.py +++ b/src/ocr/optimized_ocr.py @@ -58,7 +58,7 @@ def _initialize_reader(self) -> easyocr.Reader: """Initialize EasyOCR with optimized settings.""" return easyocr.Reader( ["en", "pl"], - gpu=False, + gpu=True, model_storage_directory="/app/data/models", download_enabled=True, quantize=True,
diff --git a/src/ocr/tesseract.py b/src/ocr/tesseract.py
new file mode 100644
index 0000000..35ca7fd
--- /dev/null
+++ b/src/ocr/tesseract.py
@@ -0,0 +1,177 @@
+import logging
+
+import cv2
+import numpy as np
+import pytesseract
+from numpy.typing import NDArray
+from PIL import Image
+
+from configs.ocr_config import OCRConfig
+from payload.ocr_models import OCRResponse, OCRResult
+
+logger = logging.getLogger(__name__)
+
+
+class ImagePreprocessor:
+    """Resize and clean up page images before they are passed to Tesseract."""
+
+    def __init__(self, target_size: int) -> None:
+        """Store the size limit and build the reusable CLAHE operator.
+
+        Args:
+            target_size: Maximum length in pixels of the longer image side;
+                0 disables resizing.
+        """
+        self.target_size = target_size
+        self._clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+
+    def resize_image(self, image: Image.Image) -> Image.Image:
+        """Downscale an image so its longer side fits within target_size.
+
+        The aspect ratio is kept. Images that already fit, and every image
+        when target_size is 0, are returned unchanged.
+        """
+        if self.target_size == 0 or max(image.size) <= self.target_size:
+            return image
+
+        width, height = image.size
+        scale = self.target_size / max(width, height)
+        # Clamp each side to 1 px: truncation would otherwise produce a
+        # zero-length side for extreme aspect ratios, which PIL rejects.
+        new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
+
+        return image.resize(new_size, resample=Image.Resampling.LANCZOS)
+
+    def enhance_image(self, image: NDArray[np.uint8]) -> NDArray[np.uint8]:
+        """Return a contrast-enhanced, denoised and binarized grayscale image.
+
+        Args:
+            image: Grayscale (2-D) or RGB (3-D) pixel array.
+        """
+        # Convert to grayscale if not already
+        if image.ndim == 3:
+            image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+
+        # Apply CLAHE for contrast enhancement
+        enhanced = self._clahe.apply(image)
+
+        # Denoise
+        denoised = cv2.fastNlMeansDenoising(enhanced)
+
+        # Binarization using Otsu's method
+        _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+        return binary
+
+    def preprocess(self, image: Image.Image) -> NDArray[np.uint8]:
+        """Run the full pipeline: resize, convert to a pixel array, enhance."""
+        resized = self.resize_image(image)
+        # Palette ("P") and bilevel ("1") images, among other modes, do not
+        # map to uint8 intensity arrays directly; normalize them to RGB.
+        if resized.mode not in ("L", "RGB"):
+            resized = resized.convert("RGB")
+        return self.enhance_image(np.array(resized))
+
+
+class TesseractOCR:
+    """OCR processing with Tesseract."""
+
+    def __init__(self, config: OCRConfig) -> None:
+        """Initialize OCR processor with configuration.
+
+        Args:
+            config: Supplies TARGET_SIZE, the resize limit for page images.
+        """
+        self.config = config
+        self.preprocessor = ImagePreprocessor(config.TARGET_SIZE)
+
+        # Tesseract CLI flags: --oem 3 selects the default OCR engine mode,
+        # --psm 11 the "sparse text" page segmentation mode.
+        # NOTE(review): no language is passed to pytesseract, so Tesseract's
+        # default applies, whereas the EasyOCR reader in optimized_ocr.py is
+        # created for ["en", "pl"] - confirm whether Polish is needed here.
+        self.custom_config = r"--oem 3 --psm 11"
+
+    @staticmethod
+    def create_bounding_box(
+        bbox_data: list[tuple[float, float]] | tuple[int, int, int, int],
+    ) -> list[int]:
+        """Create standardized bounding box from coordinates.
+
+        Args:
+            bbox_data: Either a list of (x, y) coordinate tuples or a tuple of
+                (x, y, width, height).
+
+        Returns:
+            list[int]: Standardized bounding box [x_min, y_min, x_max, y_max]
+        """
+        if isinstance(bbox_data, tuple) and len(bbox_data) == 4:
+            # Handle Tesseract format (x, y, width, height)
+            x, y, w, h = bbox_data
+            return [int(x), int(y), int(x + w), int(y + h)]
+
+        # Handle original format (list of coordinate tuples)
+        xs, ys = zip(*bbox_data)
+        return [int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))]
+
+    def _process_single(self, image: NDArray[np.uint8]) -> list[OCRResult]:
+        """Run Tesseract on one preprocessed image and collect its words.
+
+        Rows with a negative confidence or blank text are skipped.
+
+        Returns:
+            One OCRResult per recognized word, or an empty list if processing
+            fails, so a single bad page does not abort the whole batch.
+        """
+        try:
+            data = pytesseract.image_to_data(
+                image, output_type=pytesseract.Output.DICT, config=self.custom_config
+            )
+
+            ocr_results = []
+            rows = zip(
+                data["conf"],
+                data["text"],
+                data["left"],
+                data["top"],
+                data["width"],
+                data["height"],
+            )
+            for conf, text, left, top, width, height in rows:
+                word = text.strip()
+                # Parse via float: depending on the pytesseract/Tesseract
+                # versions, conf can be a fractional value or a numeric
+                # string (e.g. "96.5"), which int() would reject.
+                if not word or float(conf) < 0:
+                    continue
+
+                bbox = self.create_bounding_box((left, top, width, height))
+                ocr_results.append(OCRResult(bounding_box=bbox, word=word))
+
+            return ocr_results
+
+        except Exception:
+            # Deliberately best-effort; log with the traceback so a failure
+            # is diagnosable instead of silently yielding an empty page.
+            logger.exception("Error processing image")
+            return []
+
+    def process_batch(
+        self, images: list[Image.Image]
+    ) -> tuple[OCRResponse, list[Image.Image]]:
+        """Run OCR over a list of page images.
+
+        Returns:
+            The OCR response (one result list per image, in input order) and
+            the resized images that the bounding boxes refer to.
+        """
+        # Resize once and reuse: preprocess() leaves already-fitting images
+        # untouched, so each page is no longer resampled twice.
+        resized = [self.preprocessor.resize_image(image) for image in images]
+        processed = [self.preprocessor.preprocess(image) for image in resized]
+
+        results = [self._process_single(image) for image in processed]
+
+        response = OCRResponse(results=results, page_count=len(results))
+
+        return response, resized
diff --git a/src/processor/dataset.py b/src/processor/dataset.py index f2f1f82..423599a 100644 --- a/src/processor/dataset.py +++ b/src/processor/dataset.py @@ -153,8 +153,11 @@ def _decode_images(encoded_images: list[str]) -> list[Image.Image]: def _calculate_scales(self, image_size: tuple[int, int]) -> tuple[float, float]: """Calculate scaling factors for image resizing.""" width, height = image_size + logger.debug("Image size: %s", image_size) width_scale = self.config.IMAGE_TARGET_SIZE / width height_scale = self.config.IMAGE_TARGET_SIZE / height + logger.debug("Scaling factors: %s", (width_scale, height_scale)) + return width_scale, height_scale @staticmethod