Add PDF Table Reader (#476)

Co-authored-by: seyoung <[email protected]> Co-authored-by: Jerry Liu <[email protected]>
run-llama · Aug 24, 2023 · db68cb3 · db68cb3
1 parent abc8c2d
commit db68cb3
Show file tree

Hide file tree

Showing 5 changed files with 100 additions and 0 deletions.
diff --git a/llama_hub/library.json b/llama_hub/library.json
@@ -923,5 +923,14 @@
       "bagelDB",
       "storage"
     ]
+  },
+  "PDFTableReader": {
+    "id": "pdf_table",
+    "author": "yy0867",
+    "keywords": [
+      "table",
+      "pdf",
+      "pdf table"
+    ]
   }
 }
diff --git a/llama_hub/pdf_table/README.md b/llama_hub/pdf_table/README.md
@@ -0,0 +1,23 @@
+# PDF Table Loader
+
+This loader reads the tables included in the PDF.
+
+Pass the PDF `file` and the `pages` you want to extract tables from; the loader returns one document for each table found on those pages.
+
+## Usage
+
+Here's an example usage of the PDFTableReader.
+The `pages` parameter is passed to camelot as its own `pages` argument, so you can use the same patterns, such as `all`, `1,2,3`, `10-20`, and so on.
+
+```python
+from llama_hub.pdf_table.base import PDFTableReader
+from pathlib import Path
+
+reader = PDFTableReader()
+pdf_path = Path('/path/to/pdf')
+documents = reader.load_data(file=pdf_path, pages='80-90')
+```
+
+## Example
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
diff --git a/llama_hub/pdf_table/__init__.py b/llama_hub/pdf_table/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/llama_hub/pdf_table/base.py b/llama_hub/pdf_table/base.py
@@ -0,0 +1,64 @@
+"""PDF Table reader"""
+import pandas as pd
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+class PDFTableReader(BaseReader):
+    """PDF Table Reader. Extracts tables from a PDF with camelot, one Document per table.
+
+    Args:
+        row_separator (str): String placed between the rows of a table in the Document text.
+        col_separator (str): String placed between the cells of a single table row.
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        row_separator: str = '\n',
+        col_separator: str = ', ',
+        **kwargs: Any
+    ) -> None:
+        super().__init__(*args, **kwargs)  # remaining arguments are forwarded to BaseReader
+        self._row_separator = row_separator
+        self._col_separator = col_separator
+
+    def load_data(
+        self, 
+        file: Path,
+        pages: str = '1',
+        extra_info: Optional[Dict] = None
+    ) -> List[Document]:
+        """Extract the tables found on the given pages of a PDF file.
+
+        Args:
+            file (Path): Path to the PDF file.
+            pages (str): Pages to read tables from; passed unchanged to camelot.read_pdf.
+            extra_info (Optional[Dict]): Extra information passed to every returned Document.
+        
+        Returns:
+            List[Document]: One document per table, in the order camelot yields them.
+        """
+        import camelot  # imported at call time rather than at module import
+
+        results = []
+        tables = camelot.read_pdf(filepath=str(file), pages=pages)  # path is handed over as a str
+
+        for table in tables:
+            document = self._dataframe_to_document(df=table.df, extra_info=extra_info)
+            results.append(document)
+
+        return results
+
+    def _dataframe_to_document(self, df: pd.DataFrame, extra_info: Optional[Dict] = None) -> Document:
+        df_list = df.apply(  # one string per row: cells cast to str, joined by the column separator
+            lambda row: (self._col_separator).join(row.astype(str).tolist()), axis=1
+        ).tolist()
+
+        return Document(
+            text=self._row_separator.join(df_list),
+            extra_info=extra_info or {}  # a None (or empty) extra_info becomes a fresh empty dict
+        )
diff --git a/llama_hub/pdf_table/requirements.txt b/llama_hub/pdf_table/requirements.txt
@@ -0,0 +1,3 @@
+camelot-py
+opencv-python
+ghostscript