diff --git a/.flake8 b/.flake8 index 804c52e0..401f544b 100644 --- a/.flake8 +++ b/.flake8 @@ -49,3 +49,4 @@ per-file-ignores = scripts/benchmark_pdf_performance*:JS101 tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802 docs/source/_static/code_examples/*:I251 + docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251 diff --git a/Dockerfile b/Dockerfile index 3d00dea6..cdef9746 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,25 +1,22 @@ ARG REPOSITORY="docker.io" FROM dedocproject/dedoc_p3.9_base:version_2023_08_28 ARG LANGUAGES="" -RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done +RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$(echo $lang | tr "_" "-"); done ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root" ENV RESOURCES_PATH "/dedoc_root/resources" -ADD requirements.txt . +COPY requirements.txt . RUN pip3 install --no-cache-dir -r requirements.txt RUN mkdir /dedoc_root RUN mkdir /dedoc_root/dedoc -ADD dedoc/config.py /dedoc_root/dedoc/config.py -ADD dedoc/download_models.py /dedoc_root/dedoc/download_models.py +COPY dedoc/config.py /dedoc_root/dedoc/config.py +COPY dedoc/download_models.py /dedoc_root/dedoc/download_models.py RUN python3 /dedoc_root/dedoc/download_models.py -ADD dedoc /dedoc_root/dedoc -ADD VERSION /dedoc_root +COPY dedoc /dedoc_root/dedoc +COPY VERSION /dedoc_root RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py -ADD tests /dedoc_root/tests -ADD resources /dedoc_root/resources - -CMD ["python3", "/dedoc_root/dedoc/main.py"] +CMD [ "python3", "/dedoc_root/dedoc/main.py" ] diff --git a/VERSION b/VERSION index 04761555..ecf00d90 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.4 \ No newline at end of file +2.2.5 \ No newline at end of file diff --git a/dedoc/download_models.py b/dedoc/download_models.py index 7fa611bd..fa209b14 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -5,12 +5,12 @@ Keys are the names of repositories with models. 
""" model_hash_dict = dict( - txtlayer_classifier="94e27e184fa2876883d260e0aa58b042e6ab3e35", + txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f", scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58", font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07", - paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864", - line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013", - fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263" + paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b", + line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683", + fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8" ) @@ -27,29 +27,29 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str def download(resources_path: str) -> None: import os - download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz") + download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.json", repo_name="txtlayer_classifier", hub_name="model.json") download_from_hub(out_dir=resources_path, out_name="scan_orientation_efficient_net_b0.pth", repo_name="scan_orientation_efficient_net_b0", hub_name="model.pth") - download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz") + download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.zip", repo_name="paragraph_classifier", hub_name="model.zip") line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers") for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"): download_from_hub(out_dir=line_clf_resources_path, - out_name=f"{classifier_type}_classifier.pkl.gz", + out_name=f"{classifier_type}_classifier.zip", repo_name="line_type_classifiers", - hub_name=f"{classifier_type}.pkl.gz") + hub_name=f"{classifier_type}.zip") fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers") for language in ("en", "fr", "sp"): for classifier_type in ("target", "binary"): download_from_hub(out_dir=fintoc_classifiers_resources_path, - out_name=f"{classifier_type}_classifier_{language}.pkg.gz", + out_name=f"{classifier_type}_classifier_{language}.json", repo_name="fintoc_classifiers", - hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz") + hub_name=f"{classifier_type}_classifier_{language}_txt_layer.json") if __name__ == "__main__": diff --git a/dedoc/extensions.py b/dedoc/extensions.py index 069642e0..817e1305 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -19,7 +19,7 @@ converted_extensions = Extensions( - excel_like_format={".ods", "xls"}, + excel_like_format={".ods", ".xls"}, docx_like_format={".odt", ".doc", ".rtf"}, pptx_like_format={".odp", ".ppt"}, html_like_format={}, diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py index 8d3ec876..97cefe5b 100644 --- a/dedoc/readers/email_reader/email_reader.py +++ b/dedoc/readers/email_reader/email_reader.py @@ -44,11 +44,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure import os import uuid from dedoc.data_structures.attached_file import AttachedFile - from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments from 
dedoc.utils.utils import get_unique_name parameters = {} if parameters is None else parameters attachments_dir = get_param_attachments_dir(parameters, file_path) + with_attachments = get_param_with_attachments(parameters) + need_content_analysis = get_param_need_content_analysis(parameters) with open(file_path, "rb") as f: msg = email.message_from_binary_file(f) @@ -58,16 +60,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure lines = self.__get_main_fields(msg) header_filename = "message_header_" + get_unique_name("message_header.json") - # saving message header into separated file as an attachment - header_file_path = os.path.join(attachments_dir, header_filename) - with open(header_file_path, "w", encoding="utf-8") as f: - json.dump(all_header_fields, f, ensure_ascii=False, indent=4) - - need_content_analysis = get_param_need_content_analysis(parameters) - attachments.append(AttachedFile(original_name=header_filename, - tmp_file_path=header_file_path, - uid=f"attach_{uuid.uuid1()}", - need_content_analysis=need_content_analysis)) + if with_attachments: + # saving message header into separated file as an attachment + header_file_path = os.path.join(attachments_dir, header_filename) + with open(header_file_path, "w", encoding="utf-8") as f: + json.dump(all_header_fields, f, ensure_ascii=False, indent=4) + attachments.append(AttachedFile(original_name=header_filename, + tmp_file_path=header_file_path, + uid=f"attach_{uuid.uuid1()}", + need_content_analysis=need_content_analysis)) html_found = False text_parts = [] @@ -92,7 +93,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure if part.is_multipart(): continue - self.__add_attachment(part, attachments_dir, attachments, need_content_analysis) + if with_attachments: + self.__add_attachment(part, attachments_dir, attachments, need_content_analysis) # text/plain has the same content as text/html if not html_found: diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py index 7073ee54..0e87feb5 100644 --- a/dedoc/readers/mhtml_reader/mhtml_reader.py +++ b/dedoc/readers/mhtml_reader/mhtml_reader.py @@ -36,7 +36,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" - from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments parameters = {} if parameters is None else parameters attachments_dir = get_param_attachments_dir(parameters, file_path) @@ -51,7 +51,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure lines.extend(result.lines) tables.extend(result.tables) - need_content_analysis = get_param_need_content_analysis(parameters) tmp_file_names = [] original_file_names = [] for tmp_file_name, original_file_name in zip(names_list, original_names_list): @@ -59,8 +58,14 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure tmp_file_names.append(tmp_file_name) original_file_names.append(original_file_name) - attachments = self.__get_attachments(save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names, - need_content_analysis=need_content_analysis) + with_attachments = get_param_with_attachments(parameters) + need_content_analysis = get_param_need_content_analysis(parameters) + if with_attachments: + attachments = self.__get_attachments( + save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names, need_content_analysis=need_content_analysis + ) + else: + attachments = [] return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments) diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py index 21716598..31fb7b9b 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py @@ -1,7 +1,5 @@ -import gzip import logging import os -import pickle from typing import List from xgboost import XGBClassifier @@ -22,7 +20,7 @@ def __init__(self, *, config: dict) -> None: self.logger = config.get("logger", logging.getLogger()) self.feature_extractor = TxtlayerFeatureExtractor() - self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.pkl.gz") + self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.json") self.__model = None @property @@ -32,11 +30,11 @@ def __get_model(self) -> XGBClassifier: if not os.path.isfile(self.path): out_dir, out_name = os.path.split(self.path) - download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.pkl.gz") + download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.json") assert os.path.isfile(self.path) - with gzip.open(self.path, "rb") as f: - self.__model = pickle.load(f) + self.__model = XGBClassifier() + self.__model.load_model(self.path) if get_param_gpu_available(self.config, self.logger): gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py index 76d37062..ba6c359c 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py @@ -1,7 +1,8 @@ -import gzip +import json import logging import os -import pickle 
+import tempfile +import zipfile from typing import List from xgboost import XGBClassifier @@ -21,7 +22,7 @@ class ScanParagraphClassifierExtractor(object): def __init__(self, *, config: dict) -> None: super().__init__() self.logger = config.get("logger", logging.getLogger()) - self.path = os.path.join(get_config()["resources_path"], "paragraph_classifier.pkl.gz") + self.path = os.path.join(get_config()["resources_path"], "paragraph_classifier.zip") self.config = config self._feature_extractor = None self._classifier = None @@ -41,11 +42,17 @@ def classifier(self) -> XGBClassifier: def _unpickle(self) -> None: if not os.path.isfile(self.path): out_dir, out_name = os.path.split(self.path) - download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="paragraph_classifier", hub_name="model.pkl.gz") + download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="paragraph_classifier", hub_name="model.zip") - with gzip.open(self.path) as file: - self._classifier, parameters = pickle.load(file) - self._feature_extractor = ParagraphFeatureExtractor(**parameters, config=self.config) + with tempfile.TemporaryDirectory() as tmpdir: + with zipfile.ZipFile(self.path) as archive: + archive.extractall(tmpdir) + + with open(os.path.join(tmpdir, "parameters.json")) as parameters_file: + parameters = json.load(parameters_file) + self._classifier = XGBClassifier() + self._classifier.load_model(os.path.join(tmpdir, "classifier.json")) + self._feature_extractor = ParagraphFeatureExtractor(**parameters, config=self.config) if get_param_gpu_available(self.config, self.logger): gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0) diff --git a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py index 16b7794d..0638e00c 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py @@ -26,8 +26,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None: from dedoc.structure_extractors.line_type_classifiers.law_classifier import LawLineTypeClassifier path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=self.config) - self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=self.config) + self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.zip"), config=self.config) + self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.zip"), config=self.config) self.hierarchy_level_builders = [StubHierarchyLevelBuilder()] self.hl_type = "law" self.init_hl_depth = 1 diff --git a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py index 28508d50..591a1c16 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py @@ -32,7 +32,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.toc_builder = TocBuilder() 
self.body_builder = DiplomaBodyBuilder() path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=self.config) + self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.zip"), config=self.config) self.footnote_start_regexp = re.compile(r"^\d+ ") def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: diff --git a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py index 15cc3d86..0429bbab 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py @@ -29,8 +29,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None: self.body_builder = TzBodyBuilder() self.toc_builder = TocBuilder() path = os.path.join(get_config()["resources_path"], "line_type_classifiers") - self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=self.config) - self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=self.config) + self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.zip"), config=self.config) + self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.zip"), config=self.config) def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ diff --git a/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py b/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py index e5e62aef..1d79b29c 100644 --- a/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py +++ b/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py @@ -1,7 +1,8 @@ -import gzip +import json import logging import os -import pickle +import tempfile +import zipfile from abc import ABC from typing import Optional, Tuple @@ -32,10 +33,16 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]: """ if not os.path.isfile(path): out_dir, out_name = os.path.split(path) - download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.pkl.gz") + download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.zip") - with gzip.open(path) as file: - classifier, feature_extractor_parameters = pickle.load(file) + with tempfile.TemporaryDirectory() as tmpdir: + with zipfile.ZipFile(path) as archive: + archive.extractall(tmpdir) + + with open(os.path.join(tmpdir, "parameters.json")) as parameters_file: + feature_extractor_parameters = json.load(parameters_file) + classifier = XGBClassifier() + classifier.load_model(os.path.join(tmpdir, "classifier.json")) if get_param_gpu_available(self.config, self.logger): gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0) @@ -44,19 +51,27 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]: return classifier, feature_extractor_parameters - def save(self, path_out: str, object_for_saving: object) -> str: 
+ @staticmethod + def save(path_out: str, classifier: XGBClassifier, parameters: dict) -> str: """ - Save the pickled classifier (with initialization parameters for a feature extractor) into the `.pkl.gz` file with path=`path_out` + Save the classifier (with initialization parameters for a feature extractor) into the `.zip` file with path=`path_out` + + * classifier -> classifier.json + * parameters -> parameters.json :param path_out: path (with file name) where to save the object - :param object_for_saving: classifier with feature extractor's parameters to save + :param classifier: classifier to save + :param parameters: feature extractor parameters to save :return: the resulting path of the saved file """ - if path_out.endswith(".pkl"): - path_out += ".gz" - elif not path_out.endswith(".gz"): - path_out += ".pkl.gz" + with tempfile.TemporaryDirectory() as tmpdir: + clf_path = os.path.join(tmpdir, "classifier.json") + params_path = os.path.join(tmpdir, "parameters.json") + classifier.save_model(clf_path) + with open(params_path, "w") as out_file: + json.dump(parameters, out_file) - with gzip.open(path_out, "wb") as file_out: - pickle.dump(obj=object_for_saving, file=file_out) + with zipfile.ZipFile(path_out, "w") as archive: + archive.write(clf_path, os.path.basename(clf_path)) + archive.write(params_path, os.path.basename(params_path)) return path_out diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py index 9e00e819..43c7100b 100755 --- a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py +++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py @@ -1,7 +1,5 @@ -import gzip import logging import os -import pickle from typing import Dict, List, Optional, Union import numpy as np @@ -60,8 +58,7 @@ def fit(self, def save(self, classifiers_dir_path: str, features_importances_dir_path: str, logger: logging.Logger, features_names: List[str], reader: str) -> None: os.makedirs(classifiers_dir_path, exist_ok=True) for classifier_type in ("binary", "target"): - with gzip.open(os.path.join(classifiers_dir_path, f"{classifier_type}_classifier_{self.language}_{reader}.pkg.gz"), "wb") as output_file: - pickle.dump(self.classifiers[classifier_type], output_file) + self.classifiers[classifier_type].save_model(os.path.join(classifiers_dir_path, f"{classifier_type}_classifier_{self.language}_{reader}.json")) logger.info(f"Classifiers were saved in {classifiers_dir_path} directory") os.makedirs(features_importances_dir_path, exist_ok=True) @@ -81,15 +78,16 @@ def target_classifier(self) -> XGBClassifier: def __lazy_load_weights(self, classifier_type: str) -> XGBClassifier: if self.classifiers[classifier_type] is None: assert self.weights_dir_path is not None - file_name = f"{classifier_type}_classifier_{self.language}.pkg.gz" + file_name = f"{classifier_type}_classifier_{self.language}.json" classifier_path = os.path.join(self.weights_dir_path, file_name) if not os.path.isfile(classifier_path): download_from_hub(out_dir=self.weights_dir_path, out_name=file_name, repo_name="fintoc_classifiers", - hub_name=f"{classifier_type}_classifier_{self.language}_txt_layer.pkg.gz") + hub_name=f"{classifier_type}_classifier_{self.language}_txt_layer.json") - with gzip.open(classifier_path, "rb") as input_file: - self.classifiers[classifier_type] = pickle.load(file=input_file) + classifier = XGBClassifier() + classifier.load_model(classifier_path) + self.classifiers[classifier_type] 
= classifier return self.classifiers[classifier_type] diff --git a/dedoc/utils/langchain.py b/dedoc/utils/langchain.py new file mode 100644 index 00000000..a006ede4 --- /dev/null +++ b/dedoc/utils/langchain.py @@ -0,0 +1,239 @@ +from dedoc.extensions import converted_extensions, recognized_extensions + + +supported_extensions = { + format_group: {*recognized_extensions._asdict()[format_group], *converted_extensions._asdict()[format_group]} + for format_group in recognized_extensions._asdict().keys() +} + + +def make_manager_config(file_path: str, split: str, parsing_params: dict) -> dict: # noqa: C901 + from dedoc.utils.parameter_utils import get_param_with_attachments + from dedoc.utils.utils import get_mime_extension + from dedoc.common.exceptions.bad_file_error import BadFileFormatError + + if get_param_with_attachments(parsing_params): + return make_minimal_manager_config(split, parsing_params) + + mime, extension = get_mime_extension(file_path=file_path) + + if extension in supported_extensions["excel_like_format"]: + from dedoc.converters.concrete_converters.excel_converter import ExcelConverter + from dedoc.readers.excel_reader.excel_reader import ExcelReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = ExcelConverter(), ExcelReader(), BaseMetadataExtractor() + elif extension in supported_extensions["docx_like_format"]: + from dedoc.converters.concrete_converters.docx_converter import DocxConverter + from dedoc.readers.docx_reader.docx_reader import DocxReader + from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor + converter, reader, metadata_extractor = DocxConverter(), DocxReader(), DocxMetadataExtractor() + elif extension in supported_extensions["pptx_like_format"]: + from dedoc.converters.concrete_converters.pptx_converter import PptxConverter + from dedoc.readers.pptx_reader.pptx_reader import PptxReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = PptxConverter(), PptxReader(), BaseMetadataExtractor() + elif extension in supported_extensions["html_like_format"]: + from dedoc.readers.html_reader.html_reader import HtmlReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = None, HtmlReader(), BaseMetadataExtractor() + elif extension in supported_extensions["eml_like_format"]: + from dedoc.readers.email_reader.email_reader import EmailReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = None, EmailReader(), BaseMetadataExtractor() + elif extension in supported_extensions["mhtml_like_format"]: + from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = None, MhtmlReader(), BaseMetadataExtractor() + elif extension in supported_extensions["archive_like_format"]: + from dedoc.readers.archive_reader.archive_reader import ArchiveReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = None, ArchiveReader(), 
BaseMetadataExtractor() + elif extension in supported_extensions["image_like_format"]: + from dedoc.converters.concrete_converters.png_converter import PNGConverter + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor + converter, reader, metadata_extractor = PNGConverter(), PdfImageReader(), ImageMetadataExtractor() + elif extension in supported_extensions["pdf_like_format"]: + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer + from dedoc.converters.concrete_converters.pdf_converter import PDFConverter + from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor + pdf_with_text_layer = get_param_pdf_with_txt_layer(parsing_params) + if pdf_with_text_layer == "true": + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + converter, reader, metadata_extractor = PDFConverter(), PdfTxtlayerReader(), PdfMetadataExtractor() + elif pdf_with_text_layer == "tabby": + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader + converter, reader, metadata_extractor = PDFConverter(), PdfTabbyReader(), PdfMetadataExtractor() + elif pdf_with_text_layer == "false": + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + converter, reader, metadata_extractor = PDFConverter(), PdfImageReader(), PdfMetadataExtractor() + else: + from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader + converter, reader, metadata_extractor = PDFConverter(), PdfAutoReader(), PdfMetadataExtractor() + elif extension in supported_extensions["csv_like_format"]: + from dedoc.readers.csv_reader.csv_reader import CSVReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = None, CSVReader(), BaseMetadataExtractor() + elif extension in supported_extensions["txt_like_format"]: + from dedoc.converters.concrete_converters.txt_converter import TxtConverter + from dedoc.readers.txt_reader.raw_text_reader import RawTextReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = TxtConverter(), RawTextReader(), BaseMetadataExtractor() + elif extension in supported_extensions["json_like_format"]: + from dedoc.readers.json_reader.json_reader import JsonReader + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + converter, reader, metadata_extractor = None, JsonReader(), BaseMetadataExtractor() + else: + raise BadFileFormatError(f'Could not find the suitable reader for the file with mime = "{mime}", extension = "{extension}".') + + if split == "node": + from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor + constructors, default_constructor = {"tree": TreeConstructor()}, TreeConstructor() + else: + from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor + constructors, default_constructor = {"linear": LinearConstructor()}, LinearConstructor() + + from dedoc.converters.converter_composition import ConverterComposition + from dedoc.readers.reader_composition import ReaderComposition + from 
dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition + from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition + from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + from dedoc.attachments_handler.attachments_handler import AttachmentsHandler + from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition + + manager_config = dict( + converter=ConverterComposition(converters=[converter] if converter else []), + reader=ReaderComposition(readers=[reader]), + structure_extractor=StructureExtractorComposition(extractors={"other": DefaultStructureExtractor()}, default_key="other"), + structure_constructor=StructureConstructorComposition(constructors=constructors, default_constructor=default_constructor), + document_metadata_extractor=MetadataExtractorComposition(extractors=[metadata_extractor]), + attachments_handler=AttachmentsHandler() + ) + return manager_config + + +def make_manager_pdf_config(file_path: str, split: str, parsing_params: dict) -> dict: # noqa: C901 + from dedoc.utils.parameter_utils import get_param_with_attachments + from dedoc.utils.utils import get_mime_extension + from dedoc.common.exceptions.bad_file_error import BadFileFormatError + + if get_param_with_attachments(parsing_params): + return make_minimal_manager_config(split, parsing_params) + + mime, extension = get_mime_extension(file_path=file_path) + + if extension in supported_extensions["pdf_like_format"]: + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer + from dedoc.converters.concrete_converters.pdf_converter import PDFConverter + from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor + pdf_with_text_layer = get_param_pdf_with_txt_layer(parsing_params) + if pdf_with_text_layer == "true": + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + converter, reader, metadata_extractor = PDFConverter(), PdfTxtlayerReader(), PdfMetadataExtractor() + elif pdf_with_text_layer == "tabby": + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader + converter, reader, metadata_extractor = PDFConverter(), PdfTabbyReader(), PdfMetadataExtractor() + elif pdf_with_text_layer == "false": + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + converter, reader, metadata_extractor = PDFConverter(), PdfImageReader(), PdfMetadataExtractor() + else: + from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader + converter, reader, metadata_extractor = PDFConverter(), PdfAutoReader(), PdfMetadataExtractor() + else: + raise BadFileFormatError(f'Could not find the suitable reader for the file with mime = "{mime}", extension = "{extension}".') # noqa: T201 + + if split == "node": + from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor + constructors, default_constructor = {"tree": TreeConstructor()}, TreeConstructor() + else: + from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor + constructors, default_constructor = {"linear": LinearConstructor()}, LinearConstructor() + + from dedoc.converters.converter_composition import ConverterComposition + from dedoc.readers.reader_composition import ReaderComposition + from 
dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition + from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition + from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + from dedoc.attachments_handler.attachments_handler import AttachmentsHandler + from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition + + manager_config = dict( + converter=ConverterComposition(converters=[converter]), + reader=ReaderComposition(readers=[reader]), + structure_extractor=StructureExtractorComposition(extractors={"other": DefaultStructureExtractor()}, default_key="other"), + structure_constructor=StructureConstructorComposition(constructors=constructors, default_constructor=default_constructor), + document_metadata_extractor=MetadataExtractorComposition(extractors=[metadata_extractor]), + attachments_handler=AttachmentsHandler() + ) + return manager_config + + +def make_minimal_manager_config(split: str, parsing_params: dict) -> dict: # noqa: C901 + from dedoc.attachments_handler.attachments_handler import AttachmentsHandler + from dedoc.converters.concrete_converters.binary_converter import BinaryConverter + from dedoc.converters.concrete_converters.docx_converter import DocxConverter + from dedoc.converters.concrete_converters.excel_converter import ExcelConverter + from dedoc.converters.concrete_converters.pdf_converter import PDFConverter + from dedoc.converters.concrete_converters.png_converter import PNGConverter + from dedoc.converters.concrete_converters.pptx_converter import PptxConverter + from dedoc.converters.concrete_converters.txt_converter import TxtConverter + from dedoc.converters.converter_composition import ConverterComposition + from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.note_metadata_extarctor import NoteMetadataExtractor + from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor + from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition + from dedoc.readers.archive_reader.archive_reader import ArchiveReader + from dedoc.readers.csv_reader.csv_reader import CSVReader + from dedoc.readers.docx_reader.docx_reader import DocxReader + from dedoc.readers.email_reader.email_reader import EmailReader + from dedoc.readers.excel_reader.excel_reader import ExcelReader + from dedoc.readers.html_reader.html_reader import HtmlReader + from dedoc.readers.json_reader.json_reader import JsonReader + from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader + from dedoc.readers.note_reader.note_reader import NoteReader + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + from dedoc.readers.pptx_reader.pptx_reader import PptxReader + from dedoc.readers.reader_composition import ReaderComposition + from dedoc.readers.txt_reader.raw_text_reader import RawTextReader + from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition + from 
dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition + from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer + + converters = [DocxConverter(), ExcelConverter(), PptxConverter(), TxtConverter(), PDFConverter(), PNGConverter(), BinaryConverter()] + readers = [] + pdf_with_text_layer = get_param_pdf_with_txt_layer(parsing_params) + if pdf_with_text_layer == "true": + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + readers.append(PdfTxtlayerReader()) + elif pdf_with_text_layer == "tabby": + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader + readers.append(PdfTabbyReader()) + elif pdf_with_text_layer != "false": + from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader + readers.append(PdfAutoReader()) + + readers.extend([ + DocxReader(), ExcelReader(), PptxReader(), RawTextReader(), CSVReader(), HtmlReader(), NoteReader(), JsonReader(), ArchiveReader(), PdfImageReader(), + EmailReader(), MhtmlReader() + ]) + + metadata_extractors = [DocxMetadataExtractor(), PdfMetadataExtractor(), ImageMetadataExtractor(), NoteMetadataExtractor(), BaseMetadataExtractor()] + + if split == "node": + from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor + constructors, default_constructor = {"tree": TreeConstructor()}, TreeConstructor() + else: + from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor + constructors, default_constructor = {"linear": LinearConstructor()}, LinearConstructor() + + return dict( + converter=ConverterComposition(converters=converters), + reader=ReaderComposition(readers=readers), + structure_extractor=StructureExtractorComposition(extractors={"other": DefaultStructureExtractor()}, default_key="other"), + structure_constructor=StructureConstructorComposition(constructors=constructors, default_constructor=default_constructor), + document_metadata_extractor=MetadataExtractorComposition(extractors=metadata_extractors), + attachments_handler=AttachmentsHandler() + ) diff --git a/docker-compose.yml b/docker-compose.yml index 58d36ef2..88ea8a14 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,6 +21,7 @@ services: - dedoc build: context: . 
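
The `make_manager_config` / `make_manager_pdf_config` / `make_minimal_manager_config` helpers above are what the langchain loaders (added later in this patch) feed into `DedocManager`. A minimal standalone sketch of that flow, assuming `dedoc` is installed and a local `example.docx` exists (the file name is illustrative):

```python
from dedoc import DedocManager
from dedoc.utils.langchain import make_manager_config

# build a slim manager config for one concrete file, then parse it
manager_config = make_manager_config(file_path="example.docx", split="line", parsing_params={})
manager = DedocManager(manager_config=manager_config)
document_tree = manager.parse(file_path="example.docx", parameters={})

# the API schema mirrors what the langchain loaders traverse; assumes a non-empty document
first_node = document_tree.to_api_schema().dict()["content"]["structure"]["subparagraphs"][0]
print(first_node["text"])
```
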
+ dockerfile: tests/Dockerfile tty: true environment: DOC_READER_HOST: "dedoc" @@ -28,9 +29,6 @@ services: GROBID_HOST: "grobid" GROBID_PORT: 8070 is_test: $test - PYTHONPATH: $PYTHONPATH:/dedoc_root/tests:/dedoc_root - command: - bash dedoc_root/tests/run_tests_in_docker.sh grobid: image: "lfoppiano/grobid:0.8.0" diff --git a/docs/source/_static/code_examples/article_classifier.pkl.gz b/docs/source/_static/code_examples/article_classifier.pkl.gz deleted file mode 100644 index a2f2355b..00000000 Binary files a/docs/source/_static/code_examples/article_classifier.pkl.gz and /dev/null differ diff --git a/docs/source/_static/code_examples/article_classifier.zip b/docs/source/_static/code_examples/article_classifier.zip new file mode 100644 index 00000000..98107cf1 Binary files /dev/null and b/docs/source/_static/code_examples/article_classifier.zip differ diff --git a/docs/source/_static/code_examples/article_structure_extractor.py b/docs/source/_static/code_examples/article_structure_extractor.py index cbb5d6cd..f59fb6a1 100644 --- a/docs/source/_static/code_examples/article_structure_extractor.py +++ b/docs/source/_static/code_examples/article_structure_extractor.py @@ -19,7 +19,7 @@ class ArticleStructureExtractor(AbstractStructureExtractor): def __init__(self, *, config: Optional[dict] = None) -> None: super().__init__(config=config) path = os.path.abspath(os.path.dirname(__file__)) # path to the directory where the classifier weights are located - self.classifier = ArticleLineTypeClassifier(path=os.path.join(path, "article_classifier.pkl.gz"), config=self.config) + self.classifier = ArticleLineTypeClassifier(path=os.path.join(path, "article_classifier.zip"), config=self.config) self.named_item_keywords = ("abstract", "introduction", "related work", "conclusion", "references", "appendix", "acknowledgements") diff --git a/docs/source/_static/code_examples/langchain/dedoc_loader.py b/docs/source/_static/code_examples/langchain/dedoc_loader.py new file mode 100644 index 00000000..2e037694 --- /dev/null +++ b/docs/source/_static/code_examples/langchain/dedoc_loader.py @@ -0,0 +1,416 @@ +from abc import ABC, abstractmethod +from typing import ( + Dict, + Iterator, + Optional, + Tuple, + Union, +) + +from langchain_core.documents import Document + +from langchain_community.document_loaders.base import BaseLoader + + +class DedocBaseLoader(BaseLoader, ABC): + """ + Base Loader that uses `dedoc` (https://dedoc.readthedocs.io). + + Loader enables extracting text, tables and attached files from the given file: + * `Text` can be split by pages, `dedoc` tree nodes, textual lines + (according to the `split` parameter). + * `Attached files` (when with_attachments=True) + are split according to the `split` parameter. + For attachments, langchain Document object has an additional metadata field + `type`="attachment". + * `Tables` (when with_tables=True) are not split - each table corresponds to one + langchain Document object. + For tables, Document object has additional metadata fields `type`="table" + and `text_as_html` with table HTML representation. + """ + + def __init__( + self, + file_path: str, + split: str = "document", + with_tables: bool = True, + **dedoc_kwargs: Union[str, bool], + ) -> None: + """ + Initialize with file path and parsing parameters. 
+ + Args: + file_path: path to the file for processing + split: type of document splitting into parts (each part is returned + separately), default value "document" + "document": document text is returned as a single langchain Document + object (don't split) + "page": split document text into pages (works for PDF, DJVU, PPTX, PPT, + ODP) + "node": split document text into tree nodes (title nodes, list item + nodes, raw text nodes) + "line": split document text into lines + with_tables: add tables to the result - each table is returned as a single + langchain Document object + + dedoc_kwargs: parameters used for document parsing via `dedoc` + (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html). + with_attachments: enable attached files extraction + recursion_deep_attachments: recursion level for attached files + extraction, works only when with_attachments==True + pdf_with_text_layer: type of handler for parsing PDF documents, + available options + ["true", "false", "tabby", "auto", "auto_tabby" (default)] + language: language of the document for PDF without a textual layer and + images, available options ["eng", "rus", "rus+eng" (default)], + the list of languages can be extended, please see + https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html + pages: page slice to define the reading range for parsing PDF documents + is_one_column_document: detect number of columns for PDF without + a textual layer and images, available options + ["true", "false", "auto" (default)] + document_orientation: fix document orientation (90, 180, 270 degrees) + for PDF without a textual layer and images, available options + ["auto" (default), "no_change"] + need_header_footer_analysis: remove headers and footers from the output + result for parsing PDF and images + need_binarization: clean pages background (binarize) for PDF without a + textual layer and images + need_pdf_table_analysis: parse tables for PDF without a textual layer + and images + delimiter: column separator for CSV, TSV files + encoding: encoding of TXT, CSV, TSV + """ + self.valid_split_values = {"document", "page", "node", "line"} + if split not in self.valid_split_values: + raise ValueError( + f"Got {split} for `split`, but should be one of " + f"`{self.valid_split_values}`" + ) + self.split = split + + self.with_tables = with_tables + self.file_path = file_path + with_attachments = str(dedoc_kwargs.get("with_attachments", "false")).lower() + self.parsing_parameters = { + **dedoc_kwargs, + **{ + "structure_type": "tree" if self.split == "node" else "linear", + "document_type": "other", + "need_content_analysis": with_attachments, + }, + } + + def lazy_load(self) -> Iterator[Document]: + """Lazily load documents.""" + import tempfile + + try: + from dedoc import DedocManager + except ImportError: + raise ImportError( + "`dedoc` package not found, please install it with `pip install dedoc`" + ) + dedoc_manager = DedocManager(manager_config=self._make_config()) + dedoc_manager.config["logger"].disabled = True + + with tempfile.TemporaryDirectory() as tmpdir: + document_tree = dedoc_manager.parse( + file_path=self.file_path, + parameters={**self.parsing_parameters, "attachments_dir": tmpdir}, + ) + yield from self._split_document( + document_tree=document_tree.to_api_schema().dict(), split=self.split + ) + + @abstractmethod + def _make_config(self) -> dict: + """ + Make configuration for DedocManager according to the file extension and + parsing parameters. 
+ """ + pass + + def _json2txt(self, paragraph: dict) -> str: + """Get text (recursively) of the document tree node.""" + subparagraphs_text = "\n".join( + [ + self._json2txt(subparagraph) + for subparagraph in paragraph["subparagraphs"] + ] + ) + text = ( + f"{paragraph['text']}\n{subparagraphs_text}" + if subparagraphs_text + else paragraph["text"] + ) + return text + + def _parse_subparagraphs( + self, document_tree: dict, document_metadata: dict + ) -> Iterator[Document]: + """Parse recursively document tree obtained by `dedoc`.""" + if len(document_tree["subparagraphs"]) > 0: + for subparagraph in document_tree["subparagraphs"]: + yield from self._parse_subparagraphs( + document_tree=subparagraph, document_metadata=document_metadata + ) + else: + yield Document( + page_content=document_tree["text"], + metadata={**document_metadata, **document_tree["metadata"]}, + ) + + def _split_document( + self, + document_tree: dict, + split: str, + additional_metadata: Optional[dict] = None, + ) -> Iterator[Document]: + """Split document into parts according to the `split` parameter.""" + document_metadata = document_tree["metadata"] + if additional_metadata: + document_metadata = {**document_metadata, **additional_metadata} + + if split == "document": + text = self._json2txt(paragraph=document_tree["content"]["structure"]) + yield Document(page_content=text, metadata=document_metadata) + + elif split == "page": + nodes = document_tree["content"]["structure"]["subparagraphs"] + page_id = nodes[0]["metadata"]["page_id"] + page_text = "" + + for node in nodes: + if node["metadata"]["page_id"] == page_id: + page_text += self._json2txt(node) + else: + yield Document( + page_content=page_text, + metadata={**document_metadata, "page_id": page_id}, + ) + page_id = node["metadata"]["page_id"] + page_text = self._json2txt(node) + + yield Document( + page_content=page_text, + metadata={**document_metadata, "page_id": page_id}, + ) + + elif split == "line": + for node in document_tree["content"]["structure"]["subparagraphs"]: + line_metadata = node["metadata"] + yield Document( + page_content=self._json2txt(node), + metadata={**document_metadata, **line_metadata}, + ) + + elif split == "node": + yield from self._parse_subparagraphs( + document_tree=document_tree["content"]["structure"], + document_metadata=document_metadata, + ) + + else: + raise ValueError( + f"Got {split} for `split`, but should be one of " + f"`{self.valid_split_values}`" + ) + + if self.with_tables: + for table in document_tree["content"]["tables"]: + table_text, table_html = self._get_table(table) + yield Document( + page_content=table_text, + metadata={ + **table["metadata"], + "type": "table", + "text_as_html": table_html, + }, + ) + + for attachment in document_tree["attachments"]: + yield from self._split_document( + document_tree=attachment, + split=self.split, + additional_metadata={"type": "attachment"}, + ) + + def _get_table(self, table: dict) -> Tuple[str, str]: + """Get text and HTML representation of the table.""" + table_text = "" + for row in table["cells"]: + for cell in row: + table_text += " ".join(line["text"] for line in cell["lines"]) + table_text += "\t" + table_text += "\n" + + table_html = ( + '\n\n' + ) + for row in table["cells"]: + table_html += "\n" + for cell in row: + cell_text = "\n".join(line["text"] for line in cell["lines"]) + table_html += "{cell_text}\n' + ) + table_html += "\n" + table_html += "\n
" + + return table_text, table_html + + +class DedocFileLoader(DedocBaseLoader): + """ + Load files using `dedoc`. + + The file loader automatically detects the file type (with the correct extension). + The list of supported file types is gives at + https://dedoc.readthedocs.io/en/latest/index.html#id1. + Please see the documentation of DedocBaseLoader to get more details. + + Examples + -------- + ```python + from langchain_community.document_loaders import DedocFileLoader + + loader = DedocFileLoader( + "example.pdf", split="page", pdf_with_text_layer="tabby", pages=":10" + ) + docs = loader.load() + ``` + + References + ---------- + https://dedoc.readthedocs.io/en/latest/index.html#id1 + https://dedoc.readthedocs.io/en/latest/parameters/parameters.html + """ + + def _make_config(self) -> dict: + from dedoc.utils.langchain import make_manager_config + + return make_manager_config( + file_path=self.file_path, + parsing_params=self.parsing_parameters, + split=self.split, + ) + + +class DedocAPIFileLoader(DedocBaseLoader): + """ + Load files using `dedoc` API. + The file loader automatically detects the file type (even with the wrong extension). + By default, the loader makes a call to the locally hosted `dedoc` API. + You don't need to install `dedoc` library for using this loader. + Please see the documentation of DedocBaseLoader to get more details. + + Examples + -------- + ```python + from langchain_community.document_loaders import DedocAPIFileLoader + + loader = DedocAPIFileLoader( + "example.pdf", split="page", pdf_with_text_layer="tabby", pages=":10" + ) + docs = loader.load() + ``` + + References + ---------- + https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker + https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html + """ + + def __init__( + self, + file_path: str, + url: str = "http://0.0.0.0:1231", + split: str = "document", + with_tables: bool = True, + **dedoc_kwargs: Union[str, bool], + ) -> None: + """Initialize with file path, API url and parsing parameters. + + Args: + file_path: path to the file for processing + url: URL to call `dedoc` API + split: type of document splitting into parts (each part is returned + separately), default value "document" + "document": document is returned as a single langchain Document object + (don't split) + "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP) + "node": split document into tree nodes (title nodes, list item nodes, + raw text nodes) + "line": split document into lines + with_tables: add tables to the result - each table is returned as a single + langchain Document object + + dedoc_kwargs: parameters used for document parsing via `dedoc` + (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html). 
+ with_attachments: enable attached files extraction + recursion_deep_attachments: recursion level for attached files + extraction, works only when with_attachments==True + pdf_with_text_layer: type of handler for parsing PDF documents, + available options + ["true", "false", "tabby", "auto", "auto_tabby" (default)] + language: language of the document for PDF without a textual layer and + images, available options ["eng", "rus", "rus+eng" (default)], + the list of languages can be extended, please see + https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html + pages: page slice to define the reading range for parsing PDF documents + is_one_column_document: detect number of columns for PDF without + a textual layer and images, available options + ["true", "false", "auto" (default)] + document_orientation: fix document orientation (90, 180, 270 degrees) + for PDF without a textual layer and images, available options + ["auto" (default), "no_change"] + need_header_footer_analysis: remove headers and footers from the output + result for parsing PDF and images + need_binarization: clean pages background (binarize) for PDF without a + textual layer and images + need_pdf_table_analysis: parse tables for PDF without a textual layer + and images + delimiter: column separator for CSV, TSV files + encoding: encoding of TXT, CSV, TSV + """ + super().__init__( + file_path=file_path, split=split, with_tables=with_tables, **dedoc_kwargs + ) + self.url = url + self.parsing_parameters["return_format"] = "json" + + def lazy_load(self) -> Iterator[Document]: + """Lazily load documents.""" + doc_tree = self._send_file( + url=self.url, file_path=self.file_path, parameters=self.parsing_parameters + ) + yield from self._split_document(document_tree=doc_tree, split=self.split) + + def _make_config(self) -> dict: + return {} + + def _send_file( + self, url: str, file_path: str, parameters: dict + ) -> Dict[str, Union[list, dict, str]]: + """Send POST-request to `dedoc` API and return the results""" + import json + import os + + import requests + + file_name = os.path.basename(file_path) + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post(f"{url}/upload", files=files, data=parameters) + + if r.status_code != 200: + raise ValueError(f"Error during file handling: {r.content.decode()}") + + result = json.loads(r.content.decode()) + return result diff --git a/docs/source/_static/code_examples/langchain/pdf.py b/docs/source/_static/code_examples/langchain/pdf.py new file mode 100644 index 00000000..336bfbcd --- /dev/null +++ b/docs/source/_static/code_examples/langchain/pdf.py @@ -0,0 +1,57 @@ +from dedoc_loader import DedocBaseLoader # noqa from langchain_community.document_loaders.dedoc import DedocBaseLoader + + +class DedocPDFLoader(DedocBaseLoader): + """ + Load PDF files using `dedoc`. + The file loader can automatically detect the correctness of a textual layer in the + PDF document. + Note that `__init__` method supports dedoc_kwargs that differ from ones of + DedocBaseLoader. + + dedoc_kwargs: parameters used for document parsing via `dedoc` + (https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html). 
+ with_attachments: enable attached files extraction + recursion_deep_attachments: recursion level for attached files extraction, + works only when with_attachments==True + pdf_with_text_layer: type of handler for parsing, available options + ["true", "false", "tabby", "auto", "auto_tabby" (default)] + language: language of the document for PDF without a textual layer, + available options ["eng", "rus", "rus+eng" (default)], the list of + languages can be extended, please see + https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html + pages: page slice to define the reading range for parsing + is_one_column_document: detect number of columns for PDF without a textual + layer, available options ["true", "false", "auto" (default)] + document_orientation: fix document orientation (90, 180, 270 degrees) for PDF + without a textual layer, available options ["auto" (default), "no_change"] + need_header_footer_analysis: remove headers and footers from the output result + need_binarization: clean pages background (binarize) for PDF without a textual + layer + need_pdf_table_analysis: parse tables for PDF without a textual layer + + Examples + -------- + ```python + from langchain_community.document_loaders import DedocPDFLoader + + loader = DedocPDFLoader( + "example.pdf", split="page", pdf_with_text_layer="tabby", pages=":10" + ) + docs = loader.load() + ``` + + References + ---------- + https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html + https://dedoc.readthedocs.io/en/latest/modules/readers.html#dedoc.readers.PdfAutoReader + """ + + def _make_config(self) -> dict: + from dedoc.utils.langchain import make_manager_pdf_config + + return make_manager_pdf_config( + file_path=self.file_path, + parsing_params=self.parsing_parameters, + split=self.split, + ) diff --git a/docs/source/_static/code_examples/train_article_line_classifier.py b/docs/source/_static/code_examples/train_article_line_classifier.py index c8784e9b..bef75d0c 100644 --- a/docs/source/_static/code_examples/train_article_line_classifier.py +++ b/docs/source/_static/code_examples/train_article_line_classifier.py @@ -22,7 +22,7 @@ def skip_labels(label: str) -> Optional[str]: # configure path for saving a trained classifier classifier_directory_path = os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources", "line_type_classifiers") os.makedirs(classifier_directory_path, exist_ok=True) -classifier_path = os.path.join(classifier_directory_path, f"{classifier_name}.pkl.gz") +classifier_path = os.path.join(classifier_directory_path, f"{classifier_name}.zip") # configure paths for saving scores and features importances (this is not obligatory) resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 3d911775..dd5cfcbc 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,13 @@ Changelog ========= +v2.2.5 (2024-07-15) +------------------- +Release note: `v2.2.5 `_ + +* Added internal functions and classes to support integration of Dedoc into `langchain `_ +* Upgrade some dependencies, in particular, `xgboost>=1.6.0`, `pandas`, `pdfminer.six` + v2.2.4 (2024-06-20) ------------------- Release note: `v2.2.4 `_ diff --git a/requirements.txt b/requirements.txt index 7d449f59..0f5d1b03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,22 +2,22 @@ beautifulsoup4>=4.10.0,<=4.12.2 charset-normalizer>=2.0.12,<=3.2.0 Cython>=0.29.28,<=3.0.2 dedoc-utils==0.3.6 
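
For completeness, a hedged usage sketch of the loaders defined above — `example.pdf` is an assumed input file; tables and attachments are distinguished through the `type` metadata field described in the loader docstrings:

```python
from dedoc_loader import DedocFileLoader  # the loader added above

loader = DedocFileLoader(
    "example.pdf",  # assumed input file
    split="page",
    with_tables=True,
    with_attachments=True,
)
for doc in loader.lazy_load():
    kind = doc.metadata.get("type", "text")  # "table" / "attachment" for non-text parts
    print(kind, doc.page_content[:80])
```
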
diff --git a/docs/source/_static/code_examples/train_article_line_classifier.py b/docs/source/_static/code_examples/train_article_line_classifier.py
index c8784e9b..bef75d0c 100644
--- a/docs/source/_static/code_examples/train_article_line_classifier.py
+++ b/docs/source/_static/code_examples/train_article_line_classifier.py
@@ -22,7 +22,7 @@ def skip_labels(label: str) -> Optional[str]:
 # configure path for saving a trained classifier
 classifier_directory_path = os.path.join(os.path.expanduser("~"), ".cache", "dedoc", "resources", "line_type_classifiers")
 os.makedirs(classifier_directory_path, exist_ok=True)
-classifier_path = os.path.join(classifier_directory_path, f"{classifier_name}.pkl.gz")
+classifier_path = os.path.join(classifier_directory_path, f"{classifier_name}.zip")
 
 # configure paths for saving scores and features importances (this is not obligatory)
 resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources"))
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 3d911775..dd5cfcbc 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+v2.2.5 (2024-07-15)
+-------------------
+Release note: `v2.2.5 `_
+
+* Added internal functions and classes to support integration of Dedoc into `langchain `_
+* Upgraded some dependencies, in particular, `xgboost>=1.6.0`, `pandas`, `pdfminer.six`
+
 v2.2.4 (2024-06-20)
 -------------------
 Release note: `v2.2.4 `_
diff --git a/requirements.txt b/requirements.txt
index 7d449f59..0f5d1b03 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,22 +2,22 @@ beautifulsoup4>=4.10.0,<=4.12.2
 charset-normalizer>=2.0.12,<=3.2.0
 Cython>=0.29.28,<=3.0.2
 dedoc-utils==0.3.6
-fastapi>=0.77.0,<=0.103.0
-huggingface-hub>=0.14.1,<=0.16.4
+fastapi>=0.77.0,<1.0
+huggingface-hub>=0.14.1,<1.0
 imutils==0.5.4
 itsdangerous>=2.1.0,<=2.1.2
 numpy>=1.22.0,<=1.22.3
 olefile~=0.46
 opencv-python>=4.5.5.64,<4.6.0
-orjson>=3.8.11,<=3.9.5
-pandas>=1.4.1,<=1.9.0
+orjson>=3.8.11,<4.0
+pandas>=1.4.1,<3
 pdf.tocgen>=1.3.0,<=1.3.4
 pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c'
-pdfminer.six==20211012
+pdfminer.six>=20211012,<=20231228
 piexif==1.1.3
 puremagic>=1.0,<2.0 # needs libmagic to be installed in the system
 pylzma==0.5.0
-pypdf==4.1.0
+pypdf>=3.17.0,<=4.1.0
 PyPDF2==1.27.0
 pytesseract==0.3.10
 python-docx==0.8.11
@@ -31,12 +31,12 @@ roman>=3.3,<4.0
 scikit-image>=0.19.3,<=0.21.0
 scikit_learn>=1.0.2,<=1.2.2
 scipy>=1.8.0,<=1.11.2
-six==1.14.0
+six>=1.14.0,<2.0
 starlette>=0.26.1,<=0.27.0
-texttable==1.6.7
+texttable>=1.6.7,<2.0
 ujson>=5.4.0,<=5.8.0
 uvicorn>=0.18.0,<=0.23.2
 wget==3.2
-xgbfir==0.3.1
-xgboost>=1.1.1,<1.2.0
-xlrd==1.2.0
\ No newline at end of file
+xgbfir>=0.3.1,<1.0
+xgboost>=1.6.0,<2.0 # lower versions aren't compatible with pandas>2
+xlrd>=1.2.0,<2.0
diff --git a/scripts/train/train_diploma_line_classifier.py b/scripts/train/train_diploma_line_classifier.py
index dfc1695f..44f304a3 100644
--- a/scripts/train/train_diploma_line_classifier.py
+++ b/scripts/train/train_diploma_line_classifier.py
@@ -23,7 +23,7 @@ def skip_labels(label: str) -> Optional[str]:
 resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources"))
 assert os.path.isdir(resources_path)
 
-path_out = os.path.join(clf_resources_path, f"{classifier_name}.pkl.gz")
+path_out = os.path.join(clf_resources_path, f"{classifier_name}.zip")
 path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json")
 path_feature_importances = os.path.join(resources_path, "feature_importances",
                                         f"{classifier_name}_feature_importances.xlsx")
diff --git a/scripts/train/train_law_line_classifier.py b/scripts/train/train_law_line_classifier.py
index c9fafcff..0f7b5797 100644
--- a/scripts/train/train_law_line_classifier.py
+++ b/scripts/train/train_law_line_classifier.py
@@ -33,7 +33,7 @@ def transform_labels(label: str) -> Optional[str]:
 resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources"))
 assert os.path.isdir(resources_path)
 classifier_name = "law_txt_classifier" if txt_classifier else "law_classifier"
-path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz")
+path_out = os.path.join(resources_path, f"{classifier_name}.zip")
 path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json")
 path_feature_importances = os.path.join(resources_path, "feature_importances",
                                         f"{classifier_name}_feature_importances.xlsx")
diff --git a/scripts/train/train_paragraph_classifier.py b/scripts/train/train_paragraph_classifier.py
index 30fd30ca..6f3d64d7 100644
--- a/scripts/train/train_paragraph_classifier.py
+++ b/scripts/train/train_paragraph_classifier.py
@@ -17,7 +17,7 @@ def skip_labels(label: str) -> Optional[str]:
 resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources"))
 assert os.path.isdir(resources_path)
 
-path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz")
+path_out = os.path.join(resources_path, f"{classifier_name}.zip")
 path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json")
 path_feature_importances = os.path.join(resources_path, "feature_importances",
                                         f"{classifier_name}_feature_importances.xlsx")
f"{classifier_name}_feature_importances.xlsx") diff --git a/scripts/train/train_txtlayer_classifier.py b/scripts/train/train_txtlayer_classifier.py index ed3deaa4..50596ac8 100644 --- a/scripts/train/train_txtlayer_classifier.py +++ b/scripts/train/train_txtlayer_classifier.py @@ -1,6 +1,4 @@ -import gzip import os -import pickle import zipfile from pathlib import Path from typing import List, Tuple @@ -92,8 +90,7 @@ def get_texts_and_targets(self) -> Tuple[List[str], List[int]]: print(f"F1 score = {score}") resources_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "..", "resources") - with gzip.open(os.path.join(resources_dir, "txtlayer_classifier.pkl.gz"), "wb") as file: - pickle.dump(clf, file) + clf.save_model(os.path.join(resources_dir, "txtlayer_classifier.json")) xgbfir.saveXgbFI(clf, feature_names=features.columns, diff --git a/scripts/train/train_tz_line_classifier.py b/scripts/train/train_tz_line_classifier.py index ad7da326..7fba562d 100644 --- a/scripts/train/train_tz_line_classifier.py +++ b/scripts/train/train_tz_line_classifier.py @@ -19,7 +19,7 @@ def skip_labels(label: str) -> Optional[str]: resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) assert os.path.isdir(resources_path) -path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz") +path_out = os.path.join(resources_path, f"{classifier_name}.zip") path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json") path_feature_importances = os.path.join(resources_path, "feature_importances", f"{classifier_name}_feature_importances.xlsx") diff --git a/scripts/train/trainers/base_sklearn_line_classifier.py b/scripts/train/trainers/base_sklearn_line_classifier.py index eee046fb..53beba6b 100644 --- a/scripts/train/trainers/base_sklearn_line_classifier.py +++ b/scripts/train/trainers/base_sklearn_line_classifier.py @@ -1,10 +1,8 @@ import abc -import gzip import hashlib import json import logging import os -import pickle from collections import Counter, OrderedDict from statistics import mean from typing import Callable, List, Optional @@ -16,6 +14,7 @@ from xgboost import XGBClassifier from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor +from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import AbstractPickledLineTypeClassifier from dedoc.utils.utils import flatten, identity from scripts.train.trainers.data_loader import DataLoader from scripts.train.trainers.dataset import LineClassifierDataset @@ -90,10 +89,6 @@ def __init__(self, self.errors_saver = ErrorsSaver(self.path_errors, os.path.join(self.dataset_dir, "dataset.zip"), logger, config=config) self.path_features_importances = path_features_importances self.label_transformer = identity if label_transformer is None else label_transformer - - if not path_out.endswith(".pkl.gz"): - path_out = path_out + ".gz" if path_out.endswith(".pkl") else path_out + ".pkl.gz" - self.path_out = path_out self.config = config self.n_splits = n_splits @@ -131,8 +126,7 @@ def fit(self, no_cache: bool = False, cross_val_only: bool = False, save_dataset if not os.path.isdir(os.path.dirname(self.path_out)): os.makedirs(os.path.dirname(self.path_out)) - with gzip.open(self.path_out, "wb") as output_file: - pickle.dump((cls, self.feature_extractor.parameters()), output_file) + AbstractPickledLineTypeClassifier.save(path_out=self.path_out, classifier=cls, parameters=self.feature_extractor.parameters()) if 
diff --git a/scripts/train/train_tz_line_classifier.py b/scripts/train/train_tz_line_classifier.py
index ad7da326..7fba562d 100644
--- a/scripts/train/train_tz_line_classifier.py
+++ b/scripts/train/train_tz_line_classifier.py
@@ -19,7 +19,7 @@ def skip_labels(label: str) -> Optional[str]:
 resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources"))
 assert os.path.isdir(resources_path)
 
-path_out = os.path.join(resources_path, f"{classifier_name}.pkl.gz")
+path_out = os.path.join(resources_path, f"{classifier_name}.zip")
 path_scores = os.path.join(resources_path, "benchmarks", f"{classifier_name}_scores.json")
 path_feature_importances = os.path.join(resources_path, "feature_importances",
                                         f"{classifier_name}_feature_importances.xlsx")
diff --git a/scripts/train/trainers/base_sklearn_line_classifier.py b/scripts/train/trainers/base_sklearn_line_classifier.py
index eee046fb..53beba6b 100644
--- a/scripts/train/trainers/base_sklearn_line_classifier.py
+++ b/scripts/train/trainers/base_sklearn_line_classifier.py
@@ -1,10 +1,8 @@
 import abc
-import gzip
 import hashlib
 import json
 import logging
 import os
-import pickle
 from collections import Counter, OrderedDict
 from statistics import mean
 from typing import Callable, List, Optional
@@ -16,6 +14,7 @@
 from xgboost import XGBClassifier
 
 from dedoc.structure_extractors.feature_extractors.abstract_extractor import AbstractFeatureExtractor
+from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import AbstractPickledLineTypeClassifier
 from dedoc.utils.utils import flatten, identity
 from scripts.train.trainers.data_loader import DataLoader
 from scripts.train.trainers.dataset import LineClassifierDataset
@@ -90,10 +89,6 @@ def __init__(self,
         self.errors_saver = ErrorsSaver(self.path_errors, os.path.join(self.dataset_dir, "dataset.zip"), logger, config=config)
         self.path_features_importances = path_features_importances
         self.label_transformer = identity if label_transformer is None else label_transformer
-
-        if not path_out.endswith(".pkl.gz"):
-            path_out = path_out + ".gz" if path_out.endswith(".pkl") else path_out + ".pkl.gz"
-
         self.path_out = path_out
         self.config = config
         self.n_splits = n_splits
@@ -131,8 +126,7 @@ def fit(self, no_cache: bool = False, cross_val_only: bool = False, save_dataset
         if not os.path.isdir(os.path.dirname(self.path_out)):
             os.makedirs(os.path.dirname(self.path_out))
-        with gzip.open(self.path_out, "wb") as output_file:
-            pickle.dump((cls, self.feature_extractor.parameters()), output_file)
+        AbstractPickledLineTypeClassifier.save(path_out=self.path_out, classifier=cls, parameters=self.feature_extractor.parameters())
 
         if self.path_scores is not None:
             self.logger.info(f"Save scores in {self.path_scores}")
diff --git a/tests/Dockerfile b/tests/Dockerfile
new file mode 100644
index 00000000..8ae74f31
--- /dev/null
+++ b/tests/Dockerfile
@@ -0,0 +1,17 @@
+ARG REPOSITORY="docker.io"
+FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
+
+ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root:/dedoc_root/tests:/dedoc_root/langchain"
+
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+RUN pip3 install "langchain-community<1.0"
+
+RUN mkdir /dedoc_root
+COPY docs/source/_static/code_examples/langchain /dedoc_root/langchain
+COPY dedoc /dedoc_root/dedoc
+COPY VERSION /dedoc_root
+RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py
+COPY tests /dedoc_root/tests
+
+CMD [ "bash", "/dedoc_root/tests/run_tests_in_docker.sh" ]
In a climate of cold", - self._get_by_tree_path(tree, "0.4.1.2")["text"]) + self._get_by_tree_path(tree, "0.6.1.2")["text"]) - self.assertIn("Keywords", self._get_by_tree_path(tree, "0.4.1.3")["text"]) - self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.4.1.4")["text"]) + self.assertIn("Keywords", self._get_by_tree_path(tree, "0.6.1.3")["text"]) + self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.6.1.4")["text"]) - self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0")["text"]) + self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.7.0")["text"]) self.assertIn("The Tor network was designed to provide freedom\n" "of speech by guaranteeing anonymous communications.\n" "Whereas the cryptographic foundations of Tor, based on\n" "onion-routing [3, 9, 22, 24], are known to be robust, identity", - self._get_by_tree_path(tree, "0.5.0.0")["text"]) + self._get_by_tree_path(tree, "0.7.0.0")["text"]) def test_pdf_with_2_columns_text_2(self) -> None: file_name = "liters_state.pdf" diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py index 737adfb8..719040e7 100644 --- a/tests/api_tests/test_api_misc_with_images_refs.py +++ b/tests/api_tests/test_api_misc_with_images_refs.py @@ -71,11 +71,11 @@ def test_pdf_pdfminer_images_refs(self) -> None: self.assertEqual(attach_annotation["name"], "attachment") self.assertIn(attach_annotation["value"], attachment_uids) - attach_annotation = structure["subparagraphs"][3]["annotations"][-2] + attach_annotation = structure["subparagraphs"][2]["annotations"][-2] self.assertEqual(attach_annotation["name"], "attachment") self.assertIn(attach_annotation["value"], attachment_uids) - attach_annotation = structure["subparagraphs"][3]["annotations"][-1] + attach_annotation = structure["subparagraphs"][2]["annotations"][-1] self.assertEqual(attach_annotation["name"], "attachment") self.assertIn(attach_annotation["value"], attachment_uids) diff --git a/tests/unit_tests/test_misc_langchain_document_loader.py b/tests/unit_tests/test_misc_langchain_document_loader.py new file mode 100644 index 00000000..50035c9f --- /dev/null +++ b/tests/unit_tests/test_misc_langchain_document_loader.py @@ -0,0 +1,90 @@ +import os +import unittest + +from dedoc.dedoc_manager import DedocManager +from dedoc.utils.langchain import make_manager_config, make_manager_pdf_config + + +class TestLangchainDocumentLoader(unittest.TestCase): + test_folder_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "data") + test_files = [ + "archives/zipka.zip", "archives/zipka.tar", "archives/zipka.rar", "archives/zipka.7z", + "csvs/csv_coma.csv", "csvs/csv_tab.tsv", + "docx/english_doc.docx", "docx/english_doc.doc", "docx/english_doc.odt", "docx/english_doc.rtf", + "xlsx/example.xlsx", "xlsx/example.xls", "xlsx/example.ods", + "pptx/example.pptx", "pptx/example.ppt", "pptx/example.odp", + "htmls/example.html", "eml/message.eml", "mhtml/with_attachments.mhtml", + "json/example2.json", "txt/example.txt", "xml/simple.xml", + "scanned/example.png", "scanned/example.pdf", "scanned/example.jpg", "scanned/example_with_table7.djvu", + "pdf_auto/mixed_pdf.pdf", "pdf_with_text_layer/example.pdf", + ] + + def test_make_manager_config(self) -> None: + for file in self.test_files: + manager_config = make_manager_config(file_path=os.path.join(self.test_folder_path, file), split="node", parsing_params={}) + manager = 
diff --git a/tests/unit_tests/test_misc_langchain_document_loader.py b/tests/unit_tests/test_misc_langchain_document_loader.py
new file mode 100644
index 00000000..50035c9f
--- /dev/null
+++ b/tests/unit_tests/test_misc_langchain_document_loader.py
@@ -0,0 +1,90 @@
+import os
+import unittest
+
+from dedoc.dedoc_manager import DedocManager
+from dedoc.utils.langchain import make_manager_config, make_manager_pdf_config
+
+
+class TestLangchainDocumentLoader(unittest.TestCase):
+    test_folder_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "data")
+    test_files = [
+        "archives/zipka.zip", "archives/zipka.tar", "archives/zipka.rar", "archives/zipka.7z",
+        "csvs/csv_coma.csv", "csvs/csv_tab.tsv",
+        "docx/english_doc.docx", "docx/english_doc.doc", "docx/english_doc.odt", "docx/english_doc.rtf",
+        "xlsx/example.xlsx", "xlsx/example.xls", "xlsx/example.ods",
+        "pptx/example.pptx", "pptx/example.ppt", "pptx/example.odp",
+        "htmls/example.html", "eml/message.eml", "mhtml/with_attachments.mhtml",
+        "json/example2.json", "txt/example.txt", "xml/simple.xml",
+        "scanned/example.png", "scanned/example.pdf", "scanned/example.jpg", "scanned/example_with_table7.djvu",
+        "pdf_auto/mixed_pdf.pdf", "pdf_with_text_layer/example.pdf",
+    ]
+
+    def test_make_manager_config(self) -> None:
+        for file in self.test_files:
+            manager_config = make_manager_config(file_path=os.path.join(self.test_folder_path, file), split="node", parsing_params={})
+            manager = DedocManager(manager_config=manager_config)
+            manager.parse(file_path=os.path.join(self.test_folder_path, file))
+
+    def test_make_manager_pdf_config(self) -> None:
+        pdf_file_path = os.path.join(self.test_folder_path, "pdf_auto", "mixed_pdf.pdf")
+        for pdf_with_text_layer in ("true", "tabby", "false", "auto", "auto_tabby"):
+            manager_config = make_manager_pdf_config(file_path=pdf_file_path, split="node", parsing_params=dict(pdf_with_text_layer=pdf_with_text_layer))
+            manager = DedocManager(manager_config=manager_config)
+            manager.parse(file_path=pdf_file_path, parameters=dict(pdf_with_text_layer=pdf_with_text_layer))
+
+    def test_dedoc_file_loader(self) -> None:
+        from dedoc_loader import DedocFileLoader
+
+        for file in self.test_files:
+            loader = DedocFileLoader(os.path.join(self.test_folder_path, file), split="document", with_tables=False)
+            docs = loader.load()
+            self.assertEqual(1, len(docs))
+
+    def test_dedoc_api_loader(self) -> None:
+        from dedoc_loader import DedocAPIFileLoader
+
+        dedoc_url = f"http://{os.environ.get('DOC_READER_HOST', '0.0.0.0')}:1231"
+        for file in self.test_files:
+            loader = DedocAPIFileLoader(os.path.join(self.test_folder_path, file), url=dedoc_url, split="document", with_tables=False)
+            docs = loader.load()
+            self.assertEqual(1, len(docs))
+
+    def test_dedoc_pdf_loader(self) -> None:
+        from pdf import DedocPDFLoader
+
+        pdf_file_path = os.path.join(self.test_folder_path, "pdf_auto", "mixed_pdf.pdf")
+        for mode in ("true", "tabby", "false", "auto", "auto_tabby"):
+            loader = DedocPDFLoader(pdf_file_path, split="document", with_tables=False, pdf_with_text_layer=mode)
+            docs = loader.load()
+            self.assertEqual(1, len(docs))
+
+    def test_dedoc_base_loader(self) -> None:
+        from dedoc_loader import DedocFileLoader
+
+        file_path = os.path.join(self.test_folder_path, "with_attachments", "example_with_attachments_depth_1.pdf")
+
+        for split in ("line", "page", "node"):
+            loader = DedocFileLoader(file_path, split=split, with_tables=False)
+            docs = loader.load()
+            if split == "page":
+                self.assertEqual(1, len(docs))
+            else:
+                self.assertGreater(len(docs), 1)
+
+        loader = DedocFileLoader(
+            file_path, split="document", with_tables=True, with_attachments=True, need_content_analysis=True, need_pdf_table_analysis=False
+        )
+        text_docs, table_docs, attachment_docs = [], [], []
+        for doc in loader.load():
+            doc_type = doc.metadata.get("type", "")
+            if doc_type == "table":
+                table_docs.append(doc)
+                self.assertIn("text_as_html", doc.metadata)
+            elif doc_type == "attachment":
+                attachment_docs.append(doc)
+            else:
+                text_docs.append(doc)
+
+        self.assertEqual(1, len(text_docs))
+        self.assertEqual(1, len(table_docs))
+        self.assertEqual(5, len(attachment_docs))
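To summarize the contract the last test pins down, here is a hedged usage sketch of consuming the loader's output; the input path is a placeholder, and `dedoc_loader` is the example module added under docs/source/_static/code_examples/langchain in this patch:

```python
# Sketch (placeholder path): route Documents emitted by DedocFileLoader by
# their metadata "type", as test_dedoc_base_loader asserts.
from dedoc_loader import DedocFileLoader

loader = DedocFileLoader("example_with_attachments.pdf", split="document",
                         with_tables=True, with_attachments=True, need_content_analysis=True)
for doc in loader.load():
    doc_type = doc.metadata.get("type", "")
    if doc_type == "table":
        html = doc.metadata["text_as_html"]      # tables carry an HTML rendering
    elif doc_type == "attachment":
        print("attachment of", len(doc.page_content), "characters")
    else:
        print("main text of", len(doc.page_content), "characters")
```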