From db68cb3b1103dfea9b290ce99e1d2ae87bc48844 Mon Sep 17 00:00:00 2001 From: Seyoung Kim <27603734+yy0867@users.noreply.github.com> Date: Thu, 24 Aug 2023 11:32:46 +0900 Subject: [PATCH] Add PDF Table Reader (#476) Co-authored-by: seyoung Co-authored-by: Jerry Liu --- llama_hub/library.json | 9 ++++ llama_hub/pdf_table/README.md | 23 ++++++++++ llama_hub/pdf_table/__init__.py | 1 + llama_hub/pdf_table/base.py | 64 ++++++++++++++++++++++++++++ llama_hub/pdf_table/requirements.txt | 3 ++ 5 files changed, 100 insertions(+) create mode 100644 llama_hub/pdf_table/README.md create mode 100644 llama_hub/pdf_table/__init__.py create mode 100644 llama_hub/pdf_table/base.py create mode 100644 llama_hub/pdf_table/requirements.txt diff --git a/llama_hub/library.json b/llama_hub/library.json index 5b2728a5d3..e42cd627d9 100644 --- a/llama_hub/library.json +++ b/llama_hub/library.json @@ -923,5 +923,14 @@ "bagelDB", "storage" ] + }, + "PDFTableReader": { + "id": "pdf_table", + "author": "yy0867", + "keywords": [ + "table", + "pdf", + "pdf table" + ] } } \ No newline at end of file diff --git a/llama_hub/pdf_table/README.md b/llama_hub/pdf_table/README.md new file mode 100644 index 0000000000..e347e97bd2 --- /dev/null +++ b/llama_hub/pdf_table/README.md @@ -0,0 +1,23 @@ +# PDF Table Loader + +This loader reads the tables contained in a PDF file. + +Pass the PDF `file` and the `pages` you want to extract tables from, and the loader returns one document for each table found on those pages. + +## Usage + +Here's an example usage of the PDFTableReader. +The `pages` parameter is the same as camelot's `pages`. Therefore, you can use patterns such as `all`, `1,2,3`, `10-20`, and so on. 
+ +```python +from llama_hub.pdf_table.base import PDFTableReader +from pathlib import Path + +reader = PDFTableReader() +pdf_path = Path('/path/to/pdf') +documents = reader.load_data(file=pdf_path, pages='80-90') +``` + +## Example + +This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. \ No newline at end of file diff --git a/llama_hub/pdf_table/__init__.py b/llama_hub/pdf_table/__init__.py new file mode 100644 index 0000000000..1d4640565a --- /dev/null +++ b/llama_hub/pdf_table/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/llama_hub/pdf_table/base.py b/llama_hub/pdf_table/base.py new file mode 100644 index 0000000000..85e2ca5568 --- /dev/null +++ b/llama_hub/pdf_table/base.py @@ -0,0 +1,64 @@ +"""PDF Table reader""" +import pandas as pd + +from pathlib import Path +from typing import Any, Dict, List, Optional + +from llama_index.readers.base import BaseReader +from llama_index.readers.schema.base import Document + +class PDFTableReader(BaseReader): + """PDF Table Reader. Extracts each table found in a PDF as a text Document. + + Args: + row_separator (str): String placed between rows when a table is rendered as text (a newline by default). + col_separator (str): String placed between the cells of a row (', ' by default). + """ + + def __init__( + self, + *args: Any, + row_separator: str = '\n', + col_separator: str = ', ', + **kwargs: Any + ) -> None: + super().__init__(*args, **kwargs) + self._row_separator = row_separator + self._col_separator = col_separator + + def load_data( + self, + file: Path, + pages: str = '1', + extra_info: Optional[Dict] = None + ) -> List[Document]: + """Extract tables from a PDF file and return one Document per table. + + Args: + file (Path): Path to the PDF file; converted to str before being handed to camelot. + pages (str): Pages to read tables from, passed unchanged to camelot.read_pdf (e.g. 'all', '1,2,3', '10-20'). + extra_info (Optional[Dict]): Optional metadata for the returned Documents; when given, the same dict object is passed to each one. + + Returns: + List[Document]: One Document per table, in the order camelot returns them. 
+ """ + import camelot + + results = [] + tables = camelot.read_pdf(filepath=str(file), pages=pages) + + for table in tables: + document = self._dataframe_to_document(df=table.df, extra_info=extra_info) + results.append(document) + + return results + + def _dataframe_to_document(self, df: pd.DataFrame, extra_info: Optional[Dict] = None) -> Document: + df_list = df.apply( + lambda row: (self._col_separator).join(row.astype(str).tolist()), axis=1 + ).tolist() + + return Document( + text=self._row_separator.join(df_list), + extra_info=extra_info or {} + ) \ No newline at end of file diff --git a/llama_hub/pdf_table/requirements.txt b/llama_hub/pdf_table/requirements.txt new file mode 100644 index 0000000000..f8198e4fd6 --- /dev/null +++ b/llama_hub/pdf_table/requirements.txt @@ -0,0 +1,3 @@ +camelot-py +opencv-python +ghostscript \ No newline at end of file