From 1c8ef88b5f08d79c24f3cec16b65d857de74a605 Mon Sep 17 00:00:00 2001
From: Jakub Zenon Kujawa
Date: Wed, 18 Dec 2024 18:57:00 +0100
Subject: [PATCH] feat(encryption): implement AES file encryption and decryption functionality

---
 Dockerfile.ocr        |   1 +
 pyproject.toml        |   1 +
 src/encryption/aes.py | 133 ++++++++++++++++++++++++++++++++++++++++++
 src/ocr/ocr.py        |  83 +++++++++++++++++++++++---
 uv.lock               |  36 ++++++++---
 5 files changed, 240 insertions(+), 14 deletions(-)
 create mode 100644 src/encryption/aes.py

diff --git a/Dockerfile.ocr b/Dockerfile.ocr
index a4130c0..48932ae 100644
--- a/Dockerfile.ocr
+++ b/Dockerfile.ocr
@@ -30,6 +30,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 # Copy only necessary files
 COPY src/ocr/ /app/code/
+COPY src/encryption /app/code/encryption/
 COPY src/configs/ocr_config.py /app/code/configs/
 COPY src/configs/database_config.py /app/code/configs/
 COPY src/payload/ocr_models.py /app/code/payload/
diff --git a/pyproject.toml b/pyproject.toml
index f24f248..e4ab1aa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
     "pillow>=11.0.0",
     "psutil>=6.1.0",
     "psycopg2-binary>=2.9.10",
+    "pycryptodome>=3.21.0",
     "pytesseract>=0.3.13",
     "python-dotenv>=1.0.1",
     "python-multipart>=0.0.18",
diff --git a/src/encryption/aes.py b/src/encryption/aes.py
new file mode 100644
index 0000000..9265a90
--- /dev/null
+++ b/src/encryption/aes.py
@@ -0,0 +1,133 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import aiofiles
+from Crypto.Cipher import AES
+from Crypto.Random import get_random_bytes
+from Crypto.Util.Padding import pad, unpad
+
+if TYPE_CHECKING:
+    from os import PathLike
+
+logger = logging.getLogger(__name__)
+
+
+class AESCipher:
+    """AES cipher for file encryption and decryption.
+
+    Files are encrypted with AES in CBC mode. Encryption reads the input in
+    chunks, so the plaintext never has to fit in memory at once.
+
+    Attributes:
+        key (bytes): The encryption/decryption key.
+        chunk_size (int): Number of bytes read per step while encrypting (64KB).
+
+    Example:
+        >>> cipher = AESCipher(key=get_random_bytes(32))
+        >>> await cipher.encrypt_file("input.txt", "encrypted.bin")
+        >>> await cipher.decrypt_file("encrypted.bin", "decrypted.txt")
+
+    Note:
+        - The key must be a valid AES key size (16, 24, or 32 bytes)
+        - Files are read and written asynchronously using aiofiles
+        - The plaintext is always PKCS7-padded, even when it is block-aligned
+        - A random IV is generated per file and prepended to the ciphertext
+
+    """
+
+    def __init__(self, key: bytes) -> None:
+        """Initialize the cipher with an encryption key.
+
+        Args:
+            key (bytes): AES key used for both encryption and decryption.
+
+        """
+        self.key = key
+        self.chunk_size = 64 * 1024  # 64KB chunks
+
+    async def encrypt_file(
+        self,
+        input_file: str | PathLike[str],
+        output_file: str | PathLike[str],
+    ) -> None:
+        """Encrypt a file using AES in CBC mode.
+
+        The IV is written first, followed by the ciphertext. The input is read in
+        chunks; only whole AES blocks are encrypted along the way and the
+        remaining tail is padded exactly once at the end.
+
+        Args:
+            input_file: Path to the file to be encrypted.
+            output_file: Path where the encrypted file will be saved.
+
+        Raises:
+            OSError: If input_file cannot be read or output_file cannot be written.
+            ValueError: If the key length is not valid for AES.
+
+        """
+        iv = get_random_bytes(AES.block_size)
+        cipher = AES.new(self.key, AES.MODE_CBC, iv)
+
+        # Open the input first so a missing input leaves no stray output file.
+        async with aiofiles.open(input_file, "rb") as in_f:
+            async with aiofiles.open(output_file, "wb") as out_f:
+                await out_f.write(iv)
+
+                tail = b""
+                while chunk := await in_f.read(self.chunk_size):
+                    tail += chunk
+                    whole = len(tail) - len(tail) % AES.block_size
+                    if whole:
+                        await out_f.write(cipher.encrypt(tail[:whole]))
+                        tail = tail[whole:]
+
+                # PKCS7 always pads: block-aligned or empty input gets one full
+                # padding block, otherwise unpad() cannot tell data from padding.
+                await out_f.write(cipher.encrypt(pad(tail, AES.block_size)))
+
+    async def decrypt_file(
+        self,
+        input_file: str | PathLike[str],
+        output_file: str | PathLike[str] | None = None,
+    ) -> None:
+        """Decrypt a file produced by encrypt_file.
+
+        The ciphertext is read and decrypted in one piece, and the output file is
+        only written after the padding has been validated.
+
+        Args:
+            input_file: Path to the encrypted file (IV followed by ciphertext).
+            output_file: Path for the decrypted output. Defaults to input_file
+                with ".decrypted" appended to its suffix.
+
+        Raises:
+            ValueError: If the padding is invalid, e.g. the file is corrupted or
+                the key is incorrect.
+            OSError: If file read/write errors occur.
+
+        """
+        source = Path(input_file)
+        if output_file is None:
+            output_file = source.with_suffix(source.suffix + ".decrypted")
+
+        async with aiofiles.open(source, "rb") as in_f:
+            # The IV occupies the first block of the file.
+            iv = await in_f.read(AES.block_size)
+            cipher = AES.new(self.key, AES.MODE_CBC, iv)
+
+            # Read the rest at once so padding is stripped from the true end.
+            ciphertext = await in_f.read()
+            decrypted = cipher.decrypt(ciphertext)
+
+        try:
+            decrypted = unpad(decrypted, AES.block_size)
+        except ValueError as e:
+            err_msg = "Invalid padding"
+            logger.exception("Decryption failed - %s", err_msg)
+            raise ValueError(err_msg) from e
+
+        async with aiofiles.open(output_file, "wb") as out_f:
+            await out_f.write(decrypted)
diff --git a/src/ocr/ocr.py b/src/ocr/ocr.py
index cfec679..bc5707f 100644
--- a/src/ocr/ocr.py
+++ b/src/ocr/ocr.py
@@ -3,12 +3,14 @@
 import base64
 import io
 import logging
+import os
 from datetime import datetime
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 import aiofiles
 import aiohttp
+from dotenv import load_dotenv
 from fastapi import (
     Depends,
     FastAPI,
@@ -28,6 +30,7 @@
     DocumentRepository,
     get_repository,
 )
+from encryption.aes import AESCipher
 from utils.utils import get_unique_filename
 
 if TYPE_CHECKING:
@@ -46,7 +49,7 @@
     datefmt="%Y-%m-%d %H:%M:%S",
     handlers=[
         logging.FileHandler(
-            config.LOG_DIR / f"{datetime.now(tz=datetime.UTC):%Y-%m-%d}.log",
+            config.LOG_DIR / f"{datetime.now():%Y-%m-%d}.log",
             mode="a",
         ),
         logging.StreamHandler(),
@@ -56,11 +59,55 @@
 logger = logging.getLogger(__name__)
 
 app = FastAPI()
-# optimizer = EasyOCRWrapper(config=config)
 optimizer = TesseractWrapper(config=config)
 
+load_dotenv()
+ENCRYPTION_KEY = os.getenv("ENCRYPTION_KEY")
+if not ENCRYPTION_KEY:
+    err_msg = "Missing ENCRYPTION_KEY"
+    raise ValueError(err_msg)
+
+KEY = base64.b64decode(ENCRYPTION_KEY) + + +cipher = AESCipher(KEY) + class OCRProcessor: + """OCR Processing Handler. + + This class manages OCR (Optical Character Recognition) processing operations including + file validation, image conversion, and document storage. + + Attributes: + repository (DocumentRepository): Repository for document storage and retrieval. + + Methods: + validate_file(file: UploadFile) -> str: + Validates uploaded file against size, type and other constraints. + Returns a unique filename for the document. + + convert_file_to_images(file: UploadFile) -> list[Image.Image]: + Converts uploaded file (PDF or image) to a list of PIL Image objects. + + convert_to_base64(images: list[Image.Image]) -> list[str]: + Converts PIL Image objects to base64 encoded strings. + + save_document(file_name: str) -> Document: + Saves document metadata to the database. + + Raises: + HTTPException: For various validation failures including: + - Missing filename (400) + - Unsupported file type (415) + - Empty file (400) + - File too large (413) + - PDF page limit exceeded (400) + - Invalid PDF (400) + - Database storage failures (500) + + """ + """Handle OCR processing operations.""" def __init__(self, document_repository: DocumentRepository) -> None: @@ -190,7 +237,7 @@ async def get_docs( documents = await repository.get_all() return [doc.to_dict() for doc in documents] except DocumentError as e: - logger.exception("Failed to retrieve documents: %s", str(e)) + logger.exception("Failed to retrieve documents") raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Failed to retrieve documents", @@ -224,6 +271,28 @@ async def process_document( while content := await file.read(1024): # async read chunk await out_file.write(content) # async write chunk + # Initialize encryption for this file + temp_encrypted_file = output_file.with_suffix( + output_file.suffix + ".encrypted", + ) + + # Encrypt the file + await cipher.encrypt_file(output_file, 
temp_encrypted_file) + + # Replace original with encrypted version + if output_file.exists(): + output_file.unlink() + + Path(temp_encrypted_file).rename(output_file) + + # For debugging purposes decrypt the file + # await cipher.decrypt_file(output_file, tmp_decrypted_file) + + # if output_file.exists(): + # output_file.unlink() + + # Path(tmp_decrypted_file).rename(output_file) + file.file.seek(0) # Reset file pointer images = processor.convert_file_to_images(file) @@ -282,11 +351,11 @@ async def cleanup( if document_id: try: await repository.delete(document_id) - except DocumentError as e: - logger.exception("Failed to delete document: %s", str(e)) + except DocumentError: + logger.exception("Failed to delete document") if output_file and output_file.exists(): try: output_file.unlink() - except OSError as e: - logger.exception("Failed to delete file: %s", str(e)) + except OSError: + logger.exception("Failed to delete file") diff --git a/uv.lock b/uv.lock index c4c048e..671b5d7 100644 --- a/uv.lock +++ b/uv.lock @@ -268,6 +268,7 @@ dependencies = [ { name = "pillow" }, { name = "psutil" }, { name = "psycopg2-binary" }, + { name = "pycryptodome" }, { name = "pytesseract" }, { name = "python-dotenv" }, { name = "python-multipart" }, @@ -293,6 +294,7 @@ requires-dist = [ { name = "pillow", specifier = ">=11.0.0" }, { name = "psutil", specifier = ">=6.1.0" }, { name = "psycopg2-binary", specifier = ">=2.9.10" }, + { name = "pycryptodome", specifier = ">=3.21.0" }, { name = "pytesseract", specifier = ">=0.3.13" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "python-multipart", specifier = ">=0.0.18" }, @@ -886,7 +888,7 @@ name = "nvidia-cudnn-cu12" version = "9.1.0.70" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, ] wheels = [ 
{ url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 }, @@ -897,7 +899,7 @@ name = "nvidia-cufft-cu12" version = "11.2.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, @@ -918,9 +920,9 @@ name = "nvidia-cusolver-cu12" version = "11.6.1.9" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, 
@@ -932,7 +934,7 @@ name = "nvidia-cusparse-cu12" version = "12.3.1.170" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, @@ -1209,6 +1211,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/19/9ff4551b42f2068686c50c0d199072fa67aee57fc5cf86770cacf71efda3/pyclipper-1.3.0.post6-cp313-cp313-win_amd64.whl", hash = "sha256:e5ff68fa770ac654c7974fc78792978796f068bd274e95930c0691c31e192889", size = 109672 }, ] +[[package]] +name = "pycryptodome" +version = "3.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/13/52/13b9db4a913eee948152a079fe58d035bd3d1a519584155da8e786f767e6/pycryptodome-3.21.0.tar.gz", hash = "sha256:f7787e0d469bdae763b876174cf2e6c0f7be79808af26b1da96f1a64bcf47297", size = 4818071 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/88/5e83de10450027c96c79dc65ac45e9d0d7a7fef334f39d3789a191f33602/pycryptodome-3.21.0-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:2480ec2c72438430da9f601ebc12c518c093c13111a5c1644c82cdfc2e50b1e4", size = 2495937 }, + { url = "https://files.pythonhosted.org/packages/66/e1/8f28cd8cf7f7563319819d1e172879ccce2333781ae38da61c28fe22d6ff/pycryptodome-3.21.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:de18954104667f565e2fbb4783b56667f30fb49c4d79b346f52a29cb198d5b6b", size = 1634629 }, + { url = 
"https://files.pythonhosted.org/packages/6a/c1/f75a1aaff0c20c11df8dc8e2bf8057e7f73296af7dfd8cbb40077d1c930d/pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de4b7263a33947ff440412339cb72b28a5a4c769b5c1ca19e33dd6cd1dcec6e", size = 2168708 }, + { url = "https://files.pythonhosted.org/packages/ea/66/6f2b7ddb457b19f73b82053ecc83ba768680609d56dd457dbc7e902c41aa/pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0714206d467fc911042d01ea3a1847c847bc10884cf674c82e12915cfe1649f8", size = 2254555 }, + { url = "https://files.pythonhosted.org/packages/2c/2b/152c330732a887a86cbf591ed69bd1b489439b5464806adb270f169ec139/pycryptodome-3.21.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d85c1b613121ed3dbaa5a97369b3b757909531a959d229406a75b912dd51dd1", size = 2294143 }, + { url = "https://files.pythonhosted.org/packages/55/92/517c5c498c2980c1b6d6b9965dffbe31f3cd7f20f40d00ec4069559c5902/pycryptodome-3.21.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:8898a66425a57bcf15e25fc19c12490b87bd939800f39a03ea2de2aea5e3611a", size = 2160509 }, + { url = "https://files.pythonhosted.org/packages/39/1f/c74288f54d80a20a78da87df1818c6464ac1041d10988bb7d982c4153fbc/pycryptodome-3.21.0-cp36-abi3-musllinux_1_2_i686.whl", hash = "sha256:932c905b71a56474bff8a9c014030bc3c882cee696b448af920399f730a650c2", size = 2329480 }, + { url = "https://files.pythonhosted.org/packages/39/1b/d0b013bf7d1af7cf0a6a4fce13f5fe5813ab225313755367b36e714a63f8/pycryptodome-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:18caa8cfbc676eaaf28613637a89980ad2fd96e00c564135bf90bc3f0b34dd93", size = 2254397 }, + { url = "https://files.pythonhosted.org/packages/14/71/4cbd3870d3e926c34706f705d6793159ac49d9a213e3ababcdade5864663/pycryptodome-3.21.0-cp36-abi3-win32.whl", hash = "sha256:280b67d20e33bb63171d55b1067f61fbd932e0b1ad976b3a184303a3dad22764", size = 1775641 }, + 
{ url = "https://files.pythonhosted.org/packages/43/1d/81d59d228381576b92ecede5cd7239762c14001a828bdba30d64896e9778/pycryptodome-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b7aa25fc0baa5b1d95b7633af4f5f1838467f1815442b22487426f94e0d66c53", size = 1812863 }, + { url = "https://files.pythonhosted.org/packages/25/b3/09ff7072e6d96c9939c24cf51d3c389d7c345bf675420355c22402f71b68/pycryptodome-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:2cb635b67011bc147c257e61ce864879ffe6d03342dc74b6045059dfbdedafca", size = 1691593 }, + { url = "https://files.pythonhosted.org/packages/a8/91/38e43628148f68ba9b68dedbc323cf409e537fd11264031961fd7c744034/pycryptodome-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:4c26a2f0dc15f81ea3afa3b0c87b87e501f235d332b7f27e2225ecb80c0b1cdd", size = 1765997 }, +] + [[package]] name = "pydantic" version = "2.10.3" @@ -1840,7 +1862,7 @@ name = "triton" version = "3.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, + { name = "filelock", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux') or (platform_system != 'Darwin' and platform_system != 'Linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/78/eb/65f5ba83c2a123f6498a3097746607e5b2f16add29e36765305e4ac7fdd8/triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc", size = 209551444 },