diff --git a/llama_hub/hangeul/README.md b/llama_hub/hangeul/README.md deleted file mode 100644 index 9e638af18c..0000000000 --- a/llama_hub/hangeul/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# HWP Loader - -This loader reads the HWP file, which is the format of many official documents in South Korea. - -## Usage - -To use this loader, you need to pass in a file name. It's fine whether the file is compressed or not. - -```python -from llama_hub.hangeul.base import HWPReader -from pathlib import Path - -hwp_path = Path('/path/to/hwp') -reader = HWPReader() -documents = reader.load_data(file=hwp_path) -``` diff --git a/llama_hub/hangeul/__init__.py b/llama_hub/hangeul/__init__.py deleted file mode 100644 index 1d4640565a..0000000000 --- a/llama_hub/hangeul/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Init file.""" diff --git a/llama_hub/hangeul/base.py b/llama_hub/hangeul/base.py deleted file mode 100644 index 2f5ee5f887..0000000000 --- a/llama_hub/hangeul/base.py +++ /dev/null @@ -1,112 +0,0 @@ -import olefile -import zlib -import struct - -from pathlib import Path -from typing import Any, Dict, List, Optional - -from llama_index.readers.base import BaseReader -from llama_index.readers.schema.base import Document - -class HWPReader(BaseReader): - """Hangeul Reader. Reads contents from Hangeul file. - Args: None - """ - def __init__( - self, - *args: Any, - **kwargs: Any - ) -> None: - super().__init__(*args, **kwargs) - self.FILE_HEADER_SECTION = "FileHeader" - self.HWP_SUMMARY_SECTION = "\x05HwpSummaryInformation" - self.SECTION_NAME_LENGTH = len("Section") - self.BODYTEXT_SECTION = "BodyText" - self.HWP_TEXT_TAGS = [67] - - def load_data( - self, - file: Path, - ) -> Document: - """Load data and extract table from PDF file. - - Args: - file (Path): Path for the PDF file. - - Returns: - List[Document]: List of documents. - """ - load_file = olefile.OleFileIO(file) - file_dir = load_file.listdir() - - if self.is_valid(file_dir) == False: - raise Exception("Not Valid HwpFile") - - result_text = self._get_text(load_file, file_dir) - result = self._text_to_document(text=result_text) - return result - - def is_valid(self, dirs): - if [self.FILE_HEADER_SECTION] not in dirs: - return False - - return [self.HWP_SUMMARY_SECTION] in dirs - - def get_body_sections(self, dirs): - m = [] - for d in dirs: - if d[0] == self.BODYTEXT_SECTION: - m.append(int(d[1][self.SECTION_NAME_LENGTH:])) - - return ["BodyText/Section"+str(x) for x in sorted(m)] - - def _text_to_document(self, text: str, extra_info: Optional[Dict] = None) -> Document: - - return Document( - text=text, - extra_info=extra_info or {} - ) - - def get_text(self): - return self.text - - # 전체 text 추출 - def _get_text(self, load_file, file_dir): - sections = self.get_body_sections(file_dir) - text = "" - for section in sections: - text += self.get_text_from_section(load_file, section) - text += "\n" - - self.text = text - return self.text - - def is_compressed(self, load_file): - header = load_file.openstream("FileHeader") - header_data = header.read() - return (header_data[36] & 1) == 1 - - def get_text_from_section(self, load_file, section): - bodytext = load_file.openstream(section) - data = bodytext.read() - - unpacked_data = zlib.decompress(data, -15) if self.is_compressed(load_file) else data - size = len(unpacked_data) - - i = 0 - - text = "" - while i < size: - header = struct.unpack_from("> 10) & 0x3ff - rec_len = (header >> 20) & 0xfff - - if rec_type in self.HWP_TEXT_TAGS: - rec_data = unpacked_data[i+4:i+4+rec_len] - text += rec_data.decode('utf-16') - text += "\n" - - i += 4 + rec_len - - return text \ No newline at end of file diff --git a/llama_hub/hangeul/requirements.txt b/llama_hub/hangeul/requirements.txt deleted file mode 100644 index a2d775acd9..0000000000 --- a/llama_hub/hangeul/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -olefile diff --git a/llama_hub/hwp/base.py b/llama_hub/hwp/base.py index 2cd9afe747..63830aba1f 100644 --- a/llama_hub/hwp/base.py +++ b/llama_hub/hwp/base.py @@ -8,7 +8,7 @@ from llama_index.readers.schema.base import Document class HWPReader(BaseReader): - """Hangeul Reader. Reads contents from Hangeul file. + """Hwp Reader. Reads contents from Hwp file. Args: None """ def __init__( @@ -26,14 +26,13 @@ def __init__( def load_data( self, file: Path, + extra_info: Optional[Dict] = None ) -> Document: - """Load data and extract table from PDF file. - + """Load data and extract table from Hwp file. Args: - file (Path): Path for the PDF file. - + file (Path): Path for the Hwp file. Returns: - List[Document]: List of documents. + Document """ import olefile @@ -44,7 +43,7 @@ def load_data( raise Exception("Not Valid HwpFile") result_text = self._get_text(load_file, file_dir) - result = self._text_to_document(text=result_text) + result = self._text_to_document(text=result_text, extra_info=extra_info) return result def is_valid(self, dirs): diff --git a/llama_hub/library.json b/llama_hub/library.json index 77328b1966..58e52d4680 100644 --- a/llama_hub/library.json +++ b/llama_hub/library.json @@ -939,5 +939,12 @@ "keywords": [ "linear" ] + }, + "HWPReader": { + "id": "hwp", + "author": "sangwongenip", + "keywords": [ + "hwp" + ] } } \ No newline at end of file diff --git a/test_requirements.txt b/test_requirements.txt index 3368b85e4c..5c52aac06c 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -11,6 +11,8 @@ typing_extensions==4.5.0 llama-index>=0.6.9 atlassian-python-api html2text +olefile + # hotfix psutil