From db68cb3b1103dfea9b290ce99e1d2ae87bc48844 Mon Sep 17 00:00:00 2001
From: Seyoung Kim <27603734+yy0867@users.noreply.github.com>
Date: Thu, 24 Aug 2023 11:32:46 +0900
Subject: [PATCH] Add PDF Table Reader (#476)

Co-authored-by: seyoung <seyoung@bobidi.com>
Co-authored-by: Jerry Liu <jerryjliu98@gmail.com>
---
 llama_hub/library.json               |  9 ++++
 llama_hub/pdf_table/README.md        | 23 ++++++++++
 llama_hub/pdf_table/__init__.py      |  1 +
 llama_hub/pdf_table/base.py          | 64 ++++++++++++++++++++++++++++
 llama_hub/pdf_table/requirements.txt |  3 ++
 5 files changed, 100 insertions(+)
 create mode 100644 llama_hub/pdf_table/README.md
 create mode 100644 llama_hub/pdf_table/__init__.py
 create mode 100644 llama_hub/pdf_table/base.py
 create mode 100644 llama_hub/pdf_table/requirements.txt

diff --git a/llama_hub/library.json b/llama_hub/library.json
index 5b2728a5d3..e42cd627d9 100644
--- a/llama_hub/library.json
+++ b/llama_hub/library.json
@@ -923,5 +923,14 @@
       "bagelDB",
       "storage"
     ]
+  },
+  "PDFTableReader": {
+    "id": "pdf_table",
+    "author": "yy0867",
+    "keywords": [
+      "table",
+      "pdf",
+      "pdf table"
+    ]
   }
 }
\ No newline at end of file
diff --git a/llama_hub/pdf_table/README.md b/llama_hub/pdf_table/README.md
new file mode 100644
index 0000000000..e347e97bd2
--- /dev/null
+++ b/llama_hub/pdf_table/README.md
@@ -0,0 +1,23 @@
+# PDF Table Loader
+
+This loader reads the tables included in the PDF.
+
+Pass the PDF `file` and the `pages` you want to extract tables from; the loader returns one document for each table found on those pages.
+
+## Usage
+
+Here's an example of how to use the PDFTableReader.
+The `pages` parameter is passed through to camelot's `pages` argument, so you can use patterns such as `all`, `1,2,3`, `10-20`, and so on.
+
+```python
+from llama_hub.pdf_table.base import PDFTableReader
+from pathlib import Path
+
+reader = PDFTableReader()
+pdf_path = Path('/path/to/pdf')
+documents = reader.load_data(file=pdf_path, pages='80-90')
+```
+
+## Example
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
\ No newline at end of file
diff --git a/llama_hub/pdf_table/__init__.py b/llama_hub/pdf_table/__init__.py
new file mode 100644
index 0000000000..1d4640565a
--- /dev/null
+++ b/llama_hub/pdf_table/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/llama_hub/pdf_table/base.py b/llama_hub/pdf_table/base.py
new file mode 100644
index 0000000000..85e2ca5568
--- /dev/null
+++ b/llama_hub/pdf_table/base.py
@@ -0,0 +1,64 @@
+"""PDF Table reader"""
+import pandas as pd
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+class PDFTableReader(BaseReader):
+    """PDF Table Reader. Extracts tables from a PDF file using camelot.
+
+    Args:
+        row_separator (str): String placed between table rows (default: newline).
+        col_separator (str): String placed between cells of a row (default: ', ').
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        row_separator: str = '\n',
+        col_separator: str = ', ',
+        **kwargs: Any
+    ) -> None:
+        super().__init__(*args, **kwargs)
+        self._row_separator = row_separator
+        self._col_separator = col_separator
+
+    def load_data(
+        self, 
+        file: Path,
+        pages: str = '1',
+        extra_info: Optional[Dict] = None
+    ) -> List[Document]:
+        """Extract tables from a PDF file, producing one document per table.
+
+        Args:
+            file (Path): Path to the PDF file.
+            pages (str): Pages to read, in camelot's format (e.g. '1', '1,2,3', '10-20', 'all').
+            extra_info (Optional[Dict]): Extra information attached to every returned document.
+        
+        Returns:
+            List[Document]: One document per table found on the requested pages.
+        """
+        import camelot  # deferred import: camelot is only needed once tables are actually read
+        
+        results = []
+        tables = camelot.read_pdf(filepath=str(file), pages=pages)
+
+        for table in tables:
+            document = self._dataframe_to_document(df=table.df, extra_info=extra_info)
+            results.append(document)
+        
+        return results
+
+    def _dataframe_to_document(self, df: pd.DataFrame, extra_info: Optional[Dict] = None) -> Document:
+        df_list = df.apply(
+            lambda row: (self._col_separator).join(row.astype(str).tolist()), axis=1
+        ).tolist()
+        # df_list holds one joined string per table row; rows are joined below to form the text.
+        return Document(
+            text=self._row_separator.join(df_list),
+            extra_info=extra_info or {}
+        )
\ No newline at end of file
diff --git a/llama_hub/pdf_table/requirements.txt b/llama_hub/pdf_table/requirements.txt
new file mode 100644
index 0000000000..f8198e4fd6
--- /dev/null
+++ b/llama_hub/pdf_table/requirements.txt
@@ -0,0 +1,3 @@
+camelot-py
+opencv-python
+ghostscript
\ No newline at end of file