Skip to content

Commit

Permalink
feat(encryption): implement AES file encryption and decryption functi…
Browse files Browse the repository at this point in the history
…onality
  • Loading branch information
c0deplayer committed Dec 18, 2024
1 parent 192d38c commit 1c8ef88
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 14 deletions.
1 change: 1 addition & 0 deletions Dockerfile.ocr
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \

# Copy only necessary files
COPY src/ocr/ /app/code/
COPY src/encryption /app/code/encryption/
COPY src/configs/ocr_config.py /app/code/configs/
COPY src/configs/database_config.py /app/code/configs/
COPY src/payload/ocr_models.py /app/code/payload/
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dependencies = [
"pillow>=11.0.0",
"psutil>=6.1.0",
"psycopg2-binary>=2.9.10",
"pycryptodome>=3.21.0",
"pytesseract>=0.3.13",
"python-dotenv>=1.0.1",
"python-multipart>=0.0.18",
Expand Down
136 changes: 136 additions & 0 deletions src/encryption/aes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
from __future__ import annotations

import logging
from pathlib import Path
from typing import TYPE_CHECKING

import aiofiles
from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes
from Crypto.Util.Padding import pad, unpad

if TYPE_CHECKING:
    from os import PathLike


class AESCipher:
"""AES Cipher implementation for file encryption and decryption.
This class provides methods for encrypting and decrypting files using AES in CBC mode.
It handles large files by processing them in chunks to avoid memory issues.
Attributes:
key (bytes): The encryption/decryption key.
chunk_size (int): Size of chunks to process at a time (default 64KB).
Example:
>>> cipher = AESCipher(key=b'your-32-byte-key')
>>> await cipher.encrypt_file('input.txt', 'encrypted.bin')
>>> await cipher.decrypt_file('encrypted.bin', 'decrypted.txt')
Note:
- The key must be a valid AES key size (16, 24, or 32 bytes)
- Files are processed asynchronously using aiofiles
- Uses PKCS7 padding for data that's not a multiple of block size
- IV (Initialization Vector) is automatically generated and prepended to encrypted file
"""

def __init__(self, key: bytes) -> None:
"""Initialize OCR class with encryption key and chunk size.
Args:
key (bytes): Encryption key used for securing data.
Attributes:
key (bytes): Stored encryption key.
chunk_size (int): Size of data chunks for processing, set to 64KB.
"""
self.key = key
self.chunk_size = 64 * 1024 # 64KB chunks

async def encrypt_file(
self,
input_file: PathLike,
output_file: PathLike,
) -> None:
"""Encrypts a file using AES encryption in CBC mode.
The function reads the input file in chunks, encrypts each chunk using AES-CBC,
and writes the encrypted data to the output file. The initialization vector (IV)
is written at the beginning of the output file.
Args:
input_file (str): Path to the file to be encrypted
output_file (str): Path where the encrypted file will be saved
Returns:
None
Raises:
IOError: If there are issues reading input_file or writing to output_file
ValueError: If there are encryption related errors
Note:
- Uses PKCS7 padding when the final chunk is not a multiple of AES block size
- Processes file in chunks to handle large files efficiently
- IV is randomly generated and prepended to encrypted file
"""
"""Encrypt file in chunks to handle large files."""
iv = get_random_bytes(AES.block_size)
cipher = AES.new(self.key, AES.MODE_CBC, iv)

async with aiofiles.open(output_file, "wb") as out_f:
# Write IV first
await out_f.write(iv)

async with aiofiles.open(input_file, "rb") as in_f:
while chunk := await in_f.read(self.chunk_size):
if len(chunk) % AES.block_size != 0:
chunk = pad(chunk, AES.block_size)

encrypted_chunk = cipher.encrypt(chunk)
await out_f.write(encrypted_chunk)

async def decrypt_file(
self,
input_file: PathLike,
output_file: PathLike | None = None,
) -> None:
"""Decrypt file in chunks to handle large files.
Decrypts a file encrypted using AES-CBC mode, processing in chunks for memory
efficiency.
Args:
input_file: Path to the encrypted file
output_file: Path for decrypted output (defaults to new temp file)
Raises:
ValueError: If file is corrupted or key incorrect
IOError: If file read/write errors occur
"""
if output_file is None:
output_file = input_file.with_suffix(
input_file.suffix + ".decrypted",
)

async with aiofiles.open(input_file, "rb") as in_f:
# Read IV first
iv = await in_f.read(AES.block_size)
cipher = AES.new(self.key, AES.MODE_CBC, iv)

# Read entire file to handle padding properly
ciphertext = await in_f.read()
decrypted = cipher.decrypt(ciphertext)

try:
decrypted = unpad(decrypted, AES.block_size)
except ValueError as e:
err_msg = "Invalid padding"
logger.exception("Decryption failed - %s", err_msg)
raise ValueError(err_msg) from e

async with aiofiles.open(output_file, "wb") as out_f:
await out_f.write(decrypted)
83 changes: 76 additions & 7 deletions src/ocr/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
import base64
import io
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING

import aiofiles
import aiohttp
from dotenv import load_dotenv
from fastapi import (
Depends,
FastAPI,
Expand All @@ -28,6 +30,7 @@
DocumentRepository,
get_repository,
)
from encryption.aes import AESCipher
from utils.utils import get_unique_filename

if TYPE_CHECKING:
Expand All @@ -46,7 +49,7 @@
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[
logging.FileHandler(
config.LOG_DIR / f"{datetime.now(tz=datetime.UTC):%Y-%m-%d}.log",
config.LOG_DIR / f"{datetime.now():%Y-%m-%d}.log",
mode="a",
),
logging.StreamHandler(),
Expand All @@ -56,11 +59,55 @@
logger = logging.getLogger(__name__)

app = FastAPI()
# optimizer = EasyOCRWrapper(config=config)
optimizer = TesseractWrapper(config=config)

# Load the AES key from the environment (optionally via a .env file).
# The value is expected to be base64-encoded.
load_dotenv()
ENCRYPTION_KEY = os.getenv("ENCRYPTION_KEY")
if not ENCRYPTION_KEY:
    err_msg = "Missing ENCRYPTION_KEY"
    raise ValueError(err_msg)

KEY = base64.b64decode(ENCRYPTION_KEY)
# Fail at startup rather than on the first upload: AES only accepts
# 16-, 24-, or 32-byte keys.
if len(KEY) not in (16, 24, 32):
    err_msg = "ENCRYPTION_KEY must decode to 16, 24, or 32 bytes"
    raise ValueError(err_msg)


# Shared cipher used by the request handlers to encrypt stored uploads.
cipher = AESCipher(KEY)


class OCRProcessor:
"""OCR Processing Handler.
This class manages OCR (Optical Character Recognition) processing operations including
file validation, image conversion, and document storage.
Attributes:
repository (DocumentRepository): Repository for document storage and retrieval.
Methods:
validate_file(file: UploadFile) -> str:
Validates uploaded file against size, type and other constraints.
Returns a unique filename for the document.
convert_file_to_images(file: UploadFile) -> list[Image.Image]:
Converts uploaded file (PDF or image) to a list of PIL Image objects.
convert_to_base64(images: list[Image.Image]) -> list[str]:
Converts PIL Image objects to base64 encoded strings.
save_document(file_name: str) -> Document:
Saves document metadata to the database.
Raises:
HTTPException: For various validation failures including:
- Missing filename (400)
- Unsupported file type (415)
- Empty file (400)
- File too large (413)
- PDF page limit exceeded (400)
- Invalid PDF (400)
- Database storage failures (500)
"""

"""Handle OCR processing operations."""

def __init__(self, document_repository: DocumentRepository) -> None:
Expand Down Expand Up @@ -190,7 +237,7 @@ async def get_docs(
documents = await repository.get_all()
return [doc.to_dict() for doc in documents]
except DocumentError as e:
logger.exception("Failed to retrieve documents: %s", str(e))
logger.exception("Failed to retrieve documents")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to retrieve documents",
Expand Down Expand Up @@ -224,6 +271,28 @@ async def process_document(
while content := await file.read(1024): # async read chunk
await out_file.write(content) # async write chunk

# Initialize encryption for this file
temp_encrypted_file = output_file.with_suffix(
output_file.suffix + ".encrypted",
)

# Encrypt the file
await cipher.encrypt_file(output_file, temp_encrypted_file)

# Replace original with encrypted version
if output_file.exists():
output_file.unlink()

Path(temp_encrypted_file).rename(output_file)

# For debugging purposes decrypt the file
# await cipher.decrypt_file(output_file, tmp_decrypted_file)

# if output_file.exists():
# output_file.unlink()

# Path(tmp_decrypted_file).rename(output_file)

file.file.seek(0) # Reset file pointer

images = processor.convert_file_to_images(file)
Expand Down Expand Up @@ -282,11 +351,11 @@ async def cleanup(
if document_id:
try:
await repository.delete(document_id)
except DocumentError as e:
logger.exception("Failed to delete document: %s", str(e))
except DocumentError:
logger.exception("Failed to delete document")

if output_file and output_file.exists():
try:
output_file.unlink()
except OSError as e:
logger.exception("Failed to delete file: %s", str(e))
except OSError:
logger.exception("Failed to delete file")
36 changes: 29 additions & 7 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1c8ef88

Please sign in to comment.