GenAI eating image from DOCX #1462

Open · wants to merge 5 commits into base: main
2 changes: 1 addition & 1 deletion application/core/settings.py
@@ -10,7 +10,7 @@
class Settings(BaseSettings):
LLM_NAME: str = "docsgpt"
MODEL_NAME: Optional[str] = None # if LLM_NAME is openai, MODEL_NAME can be gpt-4 or gpt-3.5-turbo
EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2"
EMBEDDINGS_NAME: str = "openai/clip-vit-base-patch16" #"huggingface_sentence-transformers/all-mpnet-base-v2"
CELERY_BROKER_URL: str = "redis://localhost:6379/0"
CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
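Note: because Settings extends pydantic's BaseSettings, the default embedding model can usually be overridden per deployment through an environment variable rather than by editing the default above. A minimal sketch (not part of the diff, and assuming no env_prefix is configured on Settings):

import os

# BaseSettings reads environment variables matching field names (case-insensitive),
# so set the override before constructing the Settings object.
os.environ["EMBEDDINGS_NAME"] = "openai/clip-vit-base-patch16"

from application.core.settings import Settings

settings = Settings()
print(settings.EMBEDDINGS_NAME)  # -> openai/clip-vit-base-patch16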
66 changes: 32 additions & 34 deletions application/parser/file/bulk.py
@@ -1,5 +1,7 @@
"""Simple reader that reads files of different formats from a directory."""

import logging
import sys
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union

@@ -10,7 +12,7 @@
from application.parser.file.html_parser import HTMLParser
from application.parser.file.markdown_parser import MarkdownParser
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.file.tabular_parser import PandasCSVParser, ExcelParser
from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
from application.parser.file.image_parser import ImageParser
@@ -20,14 +22,14 @@
".pdf": PDFParser(),
".docx": DocxParser(),
".csv": PandasCSVParser(),
".xlsx":ExcelParser(),
".xlsx": ExcelParser(),
".epub": EpubParser(),
".md": MarkdownParser(),
".rst": RstParser(),
".html": HTMLParser(),
".mdx": MarkdownParser(),
".json":JSONParser(),
".pptx":PPTXParser(),
".json": JSONParser(),
".pptx": PPTXParser(),
".png": ImageParser(),
".jpg": ImageParser(),
".jpeg": ImageParser(),
@@ -61,16 +63,16 @@ class SimpleDirectoryReader(BaseReader):
"""

def __init__(
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
self,
input_dir: Optional[str] = None,
input_files: Optional[List] = None,
exclude_hidden: bool = True,
errors: str = "ignore",
recursive: bool = True,
required_exts: Optional[List[str]] = None,
file_extractor: Optional[Dict[str, BaseParser]] = None,
num_files_limit: Optional[int] = None,
file_metadata: Optional[Callable[[str], Dict]] = None,
) -> None:
"""Initialize with parameters."""
super().__init__()
@@ -110,8 +112,8 @@ def _add_files(self, input_dir: Path) -> List[Path]:
elif self.exclude_hidden and input_file.name.startswith("."):
continue
elif (
self.required_exts is not None
and input_file.suffix not in self.required_exts
self.required_exts is not None
and input_file.suffix not in self.required_exts
):
continue
else:
@@ -122,7 +124,7 @@ def _add_files(self, input_dir: Path) -> List[Path]:
new_input_files.extend(sub_input_files)

if self.num_files_limit is not None and self.num_files_limit > 0:
new_input_files = new_input_files[0: self.num_files_limit]
new_input_files = new_input_files[0 : self.num_files_limit]

# print total number of files added
logging.debug(
@@ -146,6 +148,7 @@ def load_data(self, concatenate: bool = False) -> List[Document]:
data: Union[str, List[str]] = ""
data_list: List[str] = []
metadata_list = []
documents: List[Document] = []
for input_file in self.input_files:
if input_file.suffix in self.file_extractor:
parser = self.file_extractor[input_file.suffix]
@@ -155,29 +158,24 @@ def load_data(self, concatenate: bool = False) -> List[Document]:
else:
# do standard read
with open(input_file, "r", errors=self.errors) as f:
data = f.read()
data = {"text": f.read(), "tables": [], "images": []}
# Prepare metadata for this file
if self.file_metadata is not None:
file_metadata = self.file_metadata(str(input_file))
else:
# Provide a default empty metadata
file_metadata = {'title': '', 'store': ''}
# TODO: Find a case with no metadata and check if it breaks anything

if isinstance(data, List):
# Extend data_list with each item in the data list
data_list.extend([str(d) for d in data])
# For each item in the data list, add the file's metadata to metadata_list
metadata_list.extend([file_metadata for _ in data])
else:
# Add the single piece of data to data_list
data_list.append(str(data))
# Add the file's metadata to metadata_list
metadata_list.append(file_metadata)
file_metadata = {"title": "", "store": ""}
# TODO: Find a case with no metadata and check if it breaks anything

if concatenate:
return [Document("\n".join(data_list))]
elif self.file_metadata is not None:
return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
else:
return [Document(d) for d in data_list]
# Create separate documents for text, tables, and images
doc = Document(
text=data.get("text", None),
tables=data.get("tables", None),
images=data.get("images", None),
extra_info=file_metadata,
)
documents.append(doc)
return documents
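With this change, load_data builds one Document per input file, carrying the parsed text plus any tables and images the parser returned, with the file metadata attached as extra_info. A minimal usage sketch (the input directory is a placeholder, and the Document attribute names are assumed to mirror the constructor arguments used above):

from application.parser.file.bulk import SimpleDirectoryReader

# Collect one Document per .docx/.pdf file found under the (placeholder) docs/ directory.
reader = SimpleDirectoryReader(input_dir="docs/", required_exts=[".docx", ".pdf"])
documents = reader.load_data()

for doc in documents:
    # extra_info defaults to {"title": "", "store": ""} when no file_metadata callable is given.
    print(doc.extra_info, len(doc.tables or []), "tables,", len(doc.images or []), "images")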
104 changes: 89 additions & 15 deletions application/parser/file/docs_parser.py
@@ -3,12 +3,23 @@
Contains parsers for docx, pdf files.

"""
from pathlib import Path
from typing import Dict

from pathlib import Path
from typing import Dict, List, Any, Union, Optional
from base64 import b64encode
from application.parser.file.base_parser import BaseParser
from application.core.settings import settings
from docx.enum.shape import WD_INLINE_SHAPE_TYPE
import requests
import logging
import sys
from docx import Document
import base64
from PIL import Image
import io

logger = logging.getLogger(__name__)


class PDFParser(BaseParser):
"""PDF parser."""
@@ -23,9 +34,9 @@ def parse_file(self, file: Path, errors: str = "ignore") -> str:
doc2md_service = "https://llm.arc53.com/doc2md"
# alternatively, you can use a local vision-capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
files = {"file": file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data

try:
@@ -51,19 +62,82 @@ def parse_file(self, file: Path, errors: str = "ignore") -> str:


class DocxParser(BaseParser):
"""Docx parser."""
"""Docx parser to extract text, tables, and images."""

def _init_parser(self) -> Dict:
"""Init parser."""
return {}

def parse_file(self, file: Path, errors: str = "ignore") -> str:
"""Parse file."""
def parse_file(self, file: Path, errors: str = "ignore") -> Dict[str, Any]:
"""Parse file and extract text, tables, and images."""
document = Document(file)
text_content = []
tables = []
images = []

# Extract text from paragraphs
for para in document.paragraphs:
if para.text.strip():
text_content.append(para.text.strip())

# Extract tables
for table in document.tables:
table_data = []
for row in table.rows:
row_data = [cell.text.strip() for cell in row.cells]
table_data.append(row_data)

# Flatten table into a string
flat_table = "\n".join([" | ".join(row) for row in table_data])
tables.append(flat_table)

# Extract images
for shape in document.inline_shapes:
if shape.type == WD_INLINE_SHAPE_TYPE.PICTURE:
image_data = self.extract_image_from_shape(shape, document=document)
if image_data:
images.append(image_data)

return {"text": "\n".join(text_content), "tables": tables, "images": images}

def extract_image_from_shape(self, shape, document) -> Optional[Dict[str, str]]:
"""Extract image from an inline shape and encode it in Base64."""
try:
import docx2txt
except ImportError:
raise ValueError("docx2txt is required to read Microsoft Word files.")

text = docx2txt.process(file)

return text
rId = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
image_part = document.part.related_parts[rId]
image_data = image_part.blob
image_filename = image_part.filename or f"image_{rId}.png"

# Validate the image
if not self.is_valid_image(image_data):
print(f"Invalid image: {image_filename}")
return None
resized_image = self.resize_image(image_data)
image_base64 = base64.b64encode(resized_image).decode("utf-8")
return {"filename": image_filename, "image_base64": image_base64}
except Exception as e:
print(f"Error extracting image: {e}")
return None

def is_valid_image(self, image_data: bytes) -> bool:
"""Validate image data."""
try:
image = Image.open(io.BytesIO(image_data))
image.verify()
return True
except Exception:
return False

def resize_image(self, image_data: bytes, width=200) -> bytes:
"""Resize image to a specified maximum width, maintaining aspect ratio."""
image = Image.open(io.BytesIO(image_data))
ratio = width / float(image.width)
height = int(image.height * ratio)
img = image.resize((width, height), Image.Resampling.LANCZOS)

# Save the resized image back to bytes
output = io.BytesIO()
# Use original format if possible, else default to PNG
img_format = image.format if image.format else "PNG"
img.save(output, format=img_format)
return output.getvalue()
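For downstream code, the key difference is that DocxParser.parse_file now returns a dict with text, tables, and Base64-encoded images instead of a plain string. A small consumption sketch (the sample path is a placeholder; depending on BaseParser, parser.init_parser() may need to be called before parse_file):

import base64
import io

from PIL import Image

from application.parser.file.docs_parser import DocxParser

parser = DocxParser()
result = parser.parse_file("report.docx")  # placeholder .docx path

print(result["text"][:200])            # extracted paragraph text
print(len(result["tables"]), "tables")
for img in result["images"]:
    # Images are resized to 200 px wide and returned Base64-encoded.
    raw = base64.b64decode(img["image_base64"])
    print(img["filename"], Image.open(io.BytesIO(raw)).size)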
10 changes: 6 additions & 4 deletions application/parser/file/image_parser.py
@@ -3,10 +3,12 @@
Contains parser for .png, .jpg, .jpeg files.

"""

from pathlib import Path
import requests
import base64
from typing import Dict, Union

from urllib.parse import urlparse
from application.parser.file.base_parser import BaseParser


@@ -21,7 +23,7 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]
doc2md_service = "https://llm.arc53.com/doc2md"
# alternatively, you can use a local vision-capable LLM
with open(file, "rb") as file_loaded:
files = {'file': file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
files = {"file": file_loaded}
response = requests.post(doc2md_service, files=files)
data = response.json()["markdown"]
return data