GenAI eating image from DOCX #1462

Open · wants to merge 5 commits into base: main
2 changes: 1 addition & 1 deletion application/core/settings.py
@@ -10,7 +10,7 @@
class Settings(BaseSettings):
LLM_NAME: str = "docsgpt"
MODEL_NAME: Optional[str] = None # if LLM_NAME is openai, MODEL_NAME can be gpt-4 or gpt-3.5-turbo
EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2"
EMBEDDINGS_NAME: str = "openai/clip-vit-base-patch16" #"huggingface_sentence-transformers/all-mpnet-base-v2"
CELERY_BROKER_URL: str = "redis://localhost:6379/0"
CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
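Note: because Settings extends pydantic's BaseSettings, the default embedding model can usually be overridden per deployment through an environment variable rather than by editing the default above. A minimal sketch (not part of the diff, and assuming no env_prefix is configured on Settings):

import os

# BaseSettings reads environment variables matching field names (case-insensitive),
# so set the override before constructing the Settings object.
os.environ["EMBEDDINGS_NAME"] = "openai/clip-vit-base-patch16"

from application.core.settings import Settings

settings = Settings()
print(settings.EMBEDDINGS_NAME)  # -> openai/clip-vit-base-patch16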
66 changes: 32 additions & 34 deletions application/parser/file/bulk.py
@@ -1,5 +1,7 @@
"""Simple reader that reads files of different formats from a directory."""

import logging
import sys
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union

@@ -10,7 +12,7 @@
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.file.tabular_parser import PandasCSVParser, ExcelParser
from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
from application.parser.file.image_parser import ImageParser
@@ -20,14 +22,14 @@
".pdf": PDFParser(),
".docx": DocxParser(),
".csv": PandasCSVParser(),
".xlsx":ExcelParser(),
".xlsx": ExcelParser(),
".epub": EpubParser(),
".md": MarkdownParser(),
".rst": RstParser(),
".html": HTMLParser(),
".mdx": MarkdownParser(),
".json":JSONParser(),
".pptx":PPTXParser(),
".json": JSONParser(),
".pptx": PPTXParser(),
".png": ImageParser(),
".jpg": ImageParser(),
".jpeg": ImageParser(),
@@ -61,16 +63,16 @@ class SimpleDirectoryReader(BaseReader):
"""

def __init__(
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
) -> None:
"""Initialize with parameters."""
super().__init__()
@@ -110,8 +112,8 @@ def _add_files(self, input_dir: Path) -> List[Path]:
elif self.exclude_hidden and input_file.name.startswith("."):
continue
elif (
self.required_exts is not None
and input_file.suffix not in self.required_exts
self.required_exts is not None
and input_file.suffix not in self.required_exts
):
continue
else:
@@ -122,7 +124,7 @@ def _add_files(self, input_dir: Path) -> List[Path]:
new_input_files.extend(sub_input_files)

if self.num_files_limit is not None and self.num_files_limit > 0:
new_input_files = new_input_files[0: self.num_files_limit]
new_input_files = new_input_files[0 : self.num_files_limit]

# print total number of files added
logging.debug(
@@ -146,6 +148,7 @@ def load_data(self, concatenate: bool = False) -> List[Document]:
data: Union[str, List[str]] = ""
data_list: List[str] = []
metadata_list = []
documents: List[Document] = []
for input_file in self.input_files:
if input_file.suffix in self.file_extractor:
parser = self.file_extractor[input_file.suffix]
@@ -155,29 +158,24 @@ def load_data(self, concatenate: bool = False) -> List[Document]:
else:
# do standard read
with open(input_file, "r", errors=self.errors) as f:
data = f.read()
data = {"text": f.read(), "tables": [], "images": []}
# Prepare metadata for this file
if self.file_metadata is not None:
file_metadata = self.file_metadata(str(input_file))
else:
# Provide a default empty metadata
file_metadata = {'title': '', 'store': ''}
# TODO: Find a case with no metadata and check if it breaks anything

if isinstance(data, List):
# Extend data_list with each item in the data list
data_list.extend([str(d) for d in data])
# For each item in the data list, add the file's metadata to metadata_list
metadata_list.extend([file_metadata for _ in data])
else:
# Add the single piece of data to data_list
data_list.append(str(data))
# Add the file's metadata to metadata_list
metadata_list.append(file_metadata)
file_metadata = {"title": "", "store": ""}
# TODO: Find a case with no metadata and check if it breaks anything

if concatenate:
return [Document("\n".join(data_list))]
elif self.file_metadata is not None:
return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
else:
return [Document(d) for d in data_list]
# Create separate documents for text, tables, and images
doc = Document(
text=data.get("text", None),
tables=data.get("tables", None),
images=data.get("images", None),
extra_info=file_metadata,
)
documents.append(doc)
return documents
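With this change, load_data builds one Document per input file, carrying the parsed text plus any tables and images the parser returned, with the file metadata attached as extra_info. A minimal usage sketch (the input directory is a placeholder, and the Document attribute names are assumed to mirror the constructor arguments used above):

from application.parser.file.bulk import SimpleDirectoryReader

# Collect one Document per .docx/.pdf file found under the (placeholder) docs/ directory.
reader = SimpleDirectoryReader(input_dir="docs/", required_exts=[".docx", ".pdf"])
documents = reader.load_data()

for doc in documents:
    # extra_info defaults to {"title": "", "store": ""} when no file_metadata callable is given.
    print(doc.extra_info, len(doc.tables or []), "tables,", len(doc.images or []), "images")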
104 changes: 89 additions & 15 deletions application/parser/file/docs_parser.py
@@ -3,12 +3,23 @@
Contains parsers for docx, pdf files.

"""
from pathlib import Path
from typing import Dict

from pathlib import Path
from typing import Dict, List, Any, Union, Optional
from base64 import b64encode
from application.parser.file.base_parser import BaseParser
from application.core.settings import settings
from docx.enum.shape import WD_INLINE_SHAPE_TYPE
import requests
import logging
import sys
from docx import Document
import base64
from PIL import Image
import io

logger = logging.getLogger(__name__)


class PDFParser(BaseParser):
"""PDF parser."""
@@ -23,9 +34,9 @@ def parse_file(self, file: Path, errors: str = "ignore") -> str:
doc2md_service = "https://llm.arc53.com/doc2md"
# alternatively, you can use a local vision-capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
files = {"file": file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data

try:
@@ -51,19 +62,82 @@ def parse_file(self, file: Path, errors: str = "ignore") -> str:


class DocxParser(BaseParser):
"""Docx parser."""
"""Docx parser to extract text, tables, and images."""

def _init_parser(self) -> Dict:
"""Init parser."""
return {}

def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
def parse_file(self, file: Path, errors: str = "ignore") -> Dict[str, Any]:
"""Parse file and extract text, tables, and images."""
document = Document(file)
text_content = []
tables = []
images = []

# Extract text from paragraphs
for para in document.paragraphs:
if para.text.strip():
text_content.append(para.text.strip())

# Extract tables
for table in document.tables:
table_data = []
for row in table.rows:
row_data = [cell.text.strip() for cell in row.cells]
table_data.append(row_data)

# Flatten table into a string
flat_table = "\n".join([" | ".join(row) for row in table_data])
tables.append(flat_table)

# Extract images
for shape in document.inline_shapes:
if shape.type == WD_INLINE_SHAPE_TYPE.PICTURE:
image_data = self.extract_image_from_shape(shape, document=document)
if image_data:
images.append(image_data)

return {"text": "\n".join(text_content), "tables": tables, "images": images}

def extract_image_from_shape(self, shape, document) -> Optional[Dict[str, str]]:
"""Extract image from an inline shape and encode it in Base64."""
try:
import docx2txt
except ImportError:
raise ValueError("docx2txt is required to read Microsoft Word files.")

text = docx2txt.process(file)

return text
rId = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
image_part = document.part.related_parts[rId]
image_data = image_part.blob
image_filename = image_part.filename or f"image_{rId}.png"

# Validate the image
if not self.is_valid_image(image_data):
print(f"Invalid image: {image_filename}")
return None
resized_image = self.resize_image(image_data)
image_base64 = base64.b64encode(resized_image).decode("utf-8")
return {"filename": image_filename, "image_base64": image_base64}
except Exception as e:
print(f"Error extracting image: {e}")
return None

def is_valid_image(self, image_data: bytes) -> bool:
"""Validate image data."""
try:
image = Image.open(io.BytesIO(image_data))
image.verify()
return True
except Exception:
return False

def resize_image(self, image_data: bytes, width=200) -> bytes:
"""Resize image to a specified maximum width, maintaining aspect ratio."""
image = Image.open(io.BytesIO(image_data))
ratio = width / float(image.width)
height = int(image.height * ratio)
img = image.resize((width, height), Image.Resampling.LANCZOS)

# Save the resized image back to bytes
output = io.BytesIO()
# Use original format if possible, else default to PNG
img_format = image.format if image.format else "PNG"
img.save(output, format=img_format)
return output.getvalue()
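For downstream code, the key difference is that DocxParser.parse_file now returns a dict with text, tables, and Base64-encoded images instead of a plain string. A small consumption sketch (the sample path is a placeholder; depending on BaseParser, parser.init_parser() may need to be called before parse_file):

import base64
import io

from PIL import Image

from application.parser.file.docs_parser import DocxParser

parser = DocxParser()
result = parser.parse_file("report.docx")  # placeholder .docx path

print(result["text"][:200])            # extracted paragraph text
print(len(result["tables"]), "tables")
for img in result["images"]:
    # Images are resized to 200 px wide and returned Base64-encoded.
    raw = base64.b64decode(img["image_base64"])
    print(img["filename"], Image.open(io.BytesIO(raw)).size)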
10 changes: 6 additions & 4 deletions application/parser/file/image_parser.py
@@ -3,10 +3,12 @@
Contains parser for .png, .jpg, .jpeg files.

"""

from pathlib import Path
import requests
import base64
from typing import Dict, Union

from urllib.parse import urlparse
from application.parser.file.base_parser import BaseParser


@@ -21,7 +23,7 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]
doc2md_service = "https://llm.arc53.com/doc2md"
# alternatively, you can use a local vision-capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
files = {"file": file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data