diff --git a/.flake8 b/.flake8
index 804c52e0..401f544b 100644
--- a/.flake8
+++ b/.flake8
@@ -49,3 +49,4 @@ per-file-ignores =
     scripts/benchmark_pdf_performance*:JS101
     tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
     docs/source/_static/code_examples/*:I251
+    docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
diff --git a/Dockerfile b/Dockerfile
index 3d00dea6..cdef9746 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,25 +1,22 @@
 ARG REPOSITORY="docker.io"
 FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
 ARG LANGUAGES=""
-RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done
+RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$(echo $lang | tr "_" "-"); done
 
 ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
 ENV RESOURCES_PATH "/dedoc_root/resources"
 
-ADD requirements.txt .
+COPY requirements.txt .
 RUN pip3 install --no-cache-dir -r requirements.txt
 
 RUN mkdir /dedoc_root
 RUN mkdir /dedoc_root/dedoc
-ADD dedoc/config.py /dedoc_root/dedoc/config.py
-ADD dedoc/download_models.py /dedoc_root/dedoc/download_models.py
+COPY dedoc/config.py /dedoc_root/dedoc/config.py
+COPY dedoc/download_models.py /dedoc_root/dedoc/download_models.py
 RUN python3 /dedoc_root/dedoc/download_models.py
-ADD dedoc /dedoc_root/dedoc
-ADD VERSION /dedoc_root
+COPY dedoc /dedoc_root/dedoc
+COPY VERSION /dedoc_root
 RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py
 
-ADD tests /dedoc_root/tests
-ADD resources /dedoc_root/resources
-
-CMD ["python3", "/dedoc_root/dedoc/main.py"]
+CMD [ "python3", "/dedoc_root/dedoc/main.py" ]
diff --git a/VERSION b/VERSION
index 04761555..ecf00d90 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.2.4
\ No newline at end of file
+2.2.5
\ No newline at end of file
diff --git a/dedoc/download_models.py b/dedoc/download_models.py
index 7fa611bd..fa209b14 100644
--- a/dedoc/download_models.py
+++ b/dedoc/download_models.py
@@ -5,12 +5,12 @@ Keys are the names of repositories with models.
 """
 model_hash_dict = dict(
-    txtlayer_classifier="94e27e184fa2876883d260e0aa58b042e6ab3e35",
+    txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
     scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
     font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
-    paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
-    line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013",
-    fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263"
+    paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b",
+    line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683",
+    fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8"
 )
@@ -27,29 +27,29 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str
 def download(resources_path: str) -> None:
     import os
 
-    download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz")
+    download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.json", repo_name="txtlayer_classifier", hub_name="model.json")
 
     download_from_hub(out_dir=resources_path, out_name="scan_orientation_efficient_net_b0.pth", repo_name="scan_orientation_efficient_net_b0", hub_name="model.pth")
 
-    download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz")
+    download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.zip", repo_name="paragraph_classifier", hub_name="model.zip")
 
     line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers")
     for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"):
         download_from_hub(out_dir=line_clf_resources_path,
-                          out_name=f"{classifier_type}_classifier.pkl.gz",
+                          out_name=f"{classifier_type}_classifier.zip",
                           repo_name="line_type_classifiers",
-                          hub_name=f"{classifier_type}.pkl.gz")
+                          hub_name=f"{classifier_type}.zip")
 
     fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers")
     for language in ("en", "fr", "sp"):
         for classifier_type in ("target", "binary"):
             download_from_hub(out_dir=fintoc_classifiers_resources_path,
-                              out_name=f"{classifier_type}_classifier_{language}.pkg.gz",
+                              out_name=f"{classifier_type}_classifier_{language}.json",
                               repo_name="fintoc_classifiers",
-                              hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz")
+                              hub_name=f"{classifier_type}_classifier_{language}_txt_layer.json")
 
 
 if __name__ == "__main__":
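The hash bumps above retrain every classifier and change its on-disk format: XGBoost models are now stored in XGBoost's native JSON form (plain files or zip bundles) instead of pickled `.pkl.gz`/`.pkg.gz` blobs, so loading a model no longer involves unpickling. A minimal sketch of fetching one of the renamed artifacts with the helper shown above; the output directory is illustrative:

```python
from dedoc.download_models import download_from_hub

# Fetch the retrained text-layer classifier; "model.json" mirrors the
# hub_name used in download() above. The out_dir is an arbitrary example.
download_from_hub(
    out_dir="/tmp/dedoc_resources",
    out_name="txtlayer_classifier.json",
    repo_name="txtlayer_classifier",
    hub_name="model.json",
)
```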
""" model_hash_dict = dict( - txtlayer_classifier="94e27e184fa2876883d260e0aa58b042e6ab3e35", + txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f", scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58", font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07", - paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864", - line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013", - fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263" + paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b", + line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683", + fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8" ) @@ -27,29 +27,29 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str def download(resources_path: str) -> None: import os - download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz") + download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.json", repo_name="txtlayer_classifier", hub_name="model.json") download_from_hub(out_dir=resources_path, out_name="scan_orientation_efficient_net_b0.pth", repo_name="scan_orientation_efficient_net_b0", hub_name="model.pth") - download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz") + download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.zip", repo_name="paragraph_classifier", hub_name="model.zip") line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers") for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"): download_from_hub(out_dir=line_clf_resources_path, - out_name=f"{classifier_type}_classifier.pkl.gz", + out_name=f"{classifier_type}_classifier.zip", repo_name="line_type_classifiers", - hub_name=f"{classifier_type}.pkl.gz") + hub_name=f"{classifier_type}.zip") fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers") for language in ("en", "fr", "sp"): for classifier_type in ("target", "binary"): download_from_hub(out_dir=fintoc_classifiers_resources_path, - out_name=f"{classifier_type}_classifier_{language}.pkg.gz", + out_name=f"{classifier_type}_classifier_{language}.json", repo_name="fintoc_classifiers", - hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz") + hub_name=f"{classifier_type}_classifier_{language}_txt_layer.json") if __name__ == "__main__": diff --git a/dedoc/extensions.py b/dedoc/extensions.py index 069642e0..817e1305 100644 --- a/dedoc/extensions.py +++ b/dedoc/extensions.py @@ -19,7 +19,7 @@ converted_extensions = Extensions( - excel_like_format={".ods", "xls"}, + excel_like_format={".ods", ".xls"}, docx_like_format={".odt", ".doc", ".rtf"}, pptx_like_format={".odp", ".ppt"}, html_like_format={}, diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py index 8d3ec876..97cefe5b 100644 --- a/dedoc/readers/email_reader/email_reader.py +++ b/dedoc/readers/email_reader/email_reader.py @@ -44,11 +44,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure import os import uuid from dedoc.data_structures.attached_file import AttachedFile - from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments from 
dedoc.utils.utils import get_unique_name parameters = {} if parameters is None else parameters attachments_dir = get_param_attachments_dir(parameters, file_path) + with_attachments = get_param_with_attachments(parameters) + need_content_analysis = get_param_need_content_analysis(parameters) with open(file_path, "rb") as f: msg = email.message_from_binary_file(f) @@ -58,16 +60,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure lines = self.__get_main_fields(msg) header_filename = "message_header_" + get_unique_name("message_header.json") - # saving message header into separated file as an attachment - header_file_path = os.path.join(attachments_dir, header_filename) - with open(header_file_path, "w", encoding="utf-8") as f: - json.dump(all_header_fields, f, ensure_ascii=False, indent=4) - - need_content_analysis = get_param_need_content_analysis(parameters) - attachments.append(AttachedFile(original_name=header_filename, - tmp_file_path=header_file_path, - uid=f"attach_{uuid.uuid1()}", - need_content_analysis=need_content_analysis)) + if with_attachments: + # saving message header into separated file as an attachment + header_file_path = os.path.join(attachments_dir, header_filename) + with open(header_file_path, "w", encoding="utf-8") as f: + json.dump(all_header_fields, f, ensure_ascii=False, indent=4) + attachments.append(AttachedFile(original_name=header_filename, + tmp_file_path=header_file_path, + uid=f"attach_{uuid.uuid1()}", + need_content_analysis=need_content_analysis)) html_found = False text_parts = [] @@ -92,7 +93,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure if part.is_multipart(): continue - self.__add_attachment(part, attachments_dir, attachments, need_content_analysis) + if with_attachments: + self.__add_attachment(part, attachments_dir, attachments, need_content_analysis) # text/plain has the same content as text/html if not html_found: diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py index 7073ee54..0e87feb5 100644 --- a/dedoc/readers/mhtml_reader/mhtml_reader.py +++ b/dedoc/readers/mhtml_reader/mhtml_reader.py @@ -36,7 +36,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
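Both the email and MHTML readers now collect attachments only when the caller opts in via `with_attachments`; previously the synthesized message-header JSON and message parts were always saved. A sketch of the resulting calling convention, using the reader directly (the file name is illustrative):

```python
from dedoc.readers.email_reader.email_reader import EmailReader

reader = EmailReader()

# Attachments (including the message-header JSON file) are saved and
# returned only when with_attachments is enabled.
with_attachments = reader.read(file_path="message.eml", parameters={"with_attachments": "true", "need_content_analysis": "true"})
print(len(with_attachments.attachments))

# Without the flag, the attachments list stays empty.
plain = reader.read(file_path="message.eml", parameters={})
print(len(plain.attachments))  # 0
```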
""" - from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis + from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments parameters = {} if parameters is None else parameters attachments_dir = get_param_attachments_dir(parameters, file_path) @@ -51,7 +51,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure lines.extend(result.lines) tables.extend(result.tables) - need_content_analysis = get_param_need_content_analysis(parameters) tmp_file_names = [] original_file_names = [] for tmp_file_name, original_file_name in zip(names_list, original_names_list): @@ -59,8 +58,14 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure tmp_file_names.append(tmp_file_name) original_file_names.append(original_file_name) - attachments = self.__get_attachments(save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names, - need_content_analysis=need_content_analysis) + with_attachments = get_param_with_attachments(parameters) + need_content_analysis = get_param_need_content_analysis(parameters) + if with_attachments: + attachments = self.__get_attachments( + save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names, need_content_analysis=need_content_analysis + ) + else: + attachments = [] return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments) diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py index 21716598..31fb7b9b 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py @@ -1,7 +1,5 @@ -import gzip import logging import os -import pickle from typing import List from xgboost import XGBClassifier @@ -22,7 +20,7 @@ def __init__(self, *, config: dict) -> None: self.logger = config.get("logger", logging.getLogger()) self.feature_extractor = TxtlayerFeatureExtractor() - self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.pkl.gz") + self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.json") self.__model = None @property @@ -32,11 +30,11 @@ def __get_model(self) -> XGBClassifier: if not os.path.isfile(self.path): out_dir, out_name = os.path.split(self.path) - download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.pkl.gz") + download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.json") assert os.path.isfile(self.path) - with gzip.open(self.path, "rb") as f: - self.__model = pickle.load(f) + self.__model = XGBClassifier() + self.__model.load_model(self.path) if get_param_gpu_available(self.config, self.logger): gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py index 76d37062..ba6c359c 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py @@ -1,7 +1,8 @@ -import gzip +import json import logging import os -import pickle 
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
index 16b7794d..0638e00c 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
@@ -26,8 +26,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         from dedoc.structure_extractors.line_type_classifiers.law_classifier import LawLineTypeClassifier
 
         path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
-        self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=self.config)
-        self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=self.config)
+        self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.zip"), config=self.config)
+        self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.zip"), config=self.config)
         self.hierarchy_level_builders = [StubHierarchyLevelBuilder()]
         self.hl_type = "law"
         self.init_hl_depth = 1
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
index 28508d50..591a1c16 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
@@ -32,7 +32,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         self.toc_builder = TocBuilder()
         self.body_builder = DiplomaBodyBuilder()
         path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
-        self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=self.config)
+        self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.zip"), config=self.config)
         self.footnote_start_regexp = re.compile(r"^\d+ ")
 
     def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
diff --git a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py
index 15cc3d86..0429bbab 100644
--- a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py
+++ b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py
@@ -29,8 +29,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         self.body_builder = TzBodyBuilder()
         self.toc_builder = TocBuilder()
         path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
-        self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=self.config)
-        self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=self.config)
+        self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.zip"), config=self.config)
+        self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.zip"), config=self.config)
 
     def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
diff --git a/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py b/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py
index e5e62aef..1d79b29c 100644
--- a/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py
+++ b/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py
@@ -1,7 +1,8 @@
-import gzip
+import json
 import logging
 import os
-import pickle
+import tempfile
+import zipfile
 from abc import ABC
 from typing import Optional, Tuple
 
@@ -32,10 +33,16 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]:
         """
         if not os.path.isfile(path):
             out_dir, out_name = os.path.split(path)
-            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.pkl.gz")
+            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.zip")
 
-        with gzip.open(path) as file:
-            classifier, feature_extractor_parameters = pickle.load(file)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with zipfile.ZipFile(path) as archive:
+                archive.extractall(tmpdir)
+
+            with open(os.path.join(tmpdir, "parameters.json")) as parameters_file:
+                feature_extractor_parameters = json.load(parameters_file)
+            classifier = XGBClassifier()
+            classifier.load_model(os.path.join(tmpdir, "classifier.json"))
 
         if get_param_gpu_available(self.config, self.logger):
             gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)
@@ -44,19 +51,27 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]:
 
         return classifier, feature_extractor_parameters
 
-    def save(self, path_out: str, object_for_saving: object) -> str:
+    @staticmethod
+    def save(path_out: str, classifier: XGBClassifier, parameters: dict) -> str:
         """
-        Save the pickled classifier (with initialization parameters for a feature extractor) into the `.pkl.gz` file with path=`path_out`
+        Save the classifier (with initialization parameters for a feature extractor) into the `.zip` file with path=`path_out`
+
+        * classifier -> classifier.json
+        * parameters -> parameters.json
 
         :param path_out: path (with file name) where to save the object
-        :param object_for_saving: classifier with feature extractor's parameters to save
+        :param classifier: classifier to save
+        :param parameters: feature extractor parameters to save
         :return: the resulting path of the saved file
        """
-        if path_out.endswith(".pkl"):
-            path_out += ".gz"
-        elif not path_out.endswith(".gz"):
-            path_out += ".pkl.gz"
+        with tempfile.TemporaryDirectory() as tmpdir:
+            clf_path = os.path.join(tmpdir, "classifier.json")
+            params_path = os.path.join(tmpdir, "parameters.json")
+            classifier.save_model(clf_path)
+            with open(params_path, "w") as out_file:
+                json.dump(parameters, out_file)
 
-        with gzip.open(path_out, "wb") as file_out:
-            pickle.dump(obj=object_for_saving, file=file_out)
+            with zipfile.ZipFile(path_out, "w") as archive:
+                archive.write(clf_path, os.path.basename(clf_path))
+                archive.write(params_path, os.path.basename(params_path))
 
         return path_out
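The zip bundle written by the new `save` keeps the booster and the feature-extractor parameters side by side, so `load` can restore both without unpickling anything. A sketch of producing and inspecting such an archive, assuming the class in this module is named `AbstractPickledLineTypeClassifier` (the parameters dict is a toy placeholder):

```python
import zipfile

import numpy as np
from xgboost import XGBClassifier

from dedoc.structure_extractors.line_type_classifiers.abstract_pickled_classifier import AbstractPickledLineTypeClassifier

clf = XGBClassifier(n_estimators=5)
clf.fit(np.random.rand(16, 4), np.random.randint(0, 2, size=16))

# save() is a staticmethod now: it writes classifier.json and parameters.json
# into a single archive at the given path.
path = AbstractPickledLineTypeClassifier.save("law_classifier.zip", clf, {"window_size": 3})
print(zipfile.ZipFile(path).namelist())  # ['classifier.json', 'parameters.json']
```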
diff --git a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
index 9e00e819..43c7100b 100755
--- a/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
+++ b/dedoc/structure_extractors/line_type_classifiers/fintoc_classifier.py
@@ -1,7 +1,5 @@
-import gzip
 import logging
 import os
-import pickle
 from typing import Dict, List, Optional, Union
 
 import numpy as np
@@ -60,8 +58,7 @@ def fit(self,
     def save(self, classifiers_dir_path: str, features_importances_dir_path: str, logger: logging.Logger, features_names: List[str], reader: str) -> None:
         os.makedirs(classifiers_dir_path, exist_ok=True)
         for classifier_type in ("binary", "target"):
-            with gzip.open(os.path.join(classifiers_dir_path, f"{classifier_type}_classifier_{self.language}_{reader}.pkg.gz"), "wb") as output_file:
-                pickle.dump(self.classifiers[classifier_type], output_file)
+            self.classifiers[classifier_type].save_model(os.path.join(classifiers_dir_path, f"{classifier_type}_classifier_{self.language}_{reader}.json"))
         logger.info(f"Classifiers were saved in {classifiers_dir_path} directory")
 
         os.makedirs(features_importances_dir_path, exist_ok=True)
@@ -81,15 +78,16 @@ def target_classifier(self) -> XGBClassifier:
     def __lazy_load_weights(self, classifier_type: str) -> XGBClassifier:
         if self.classifiers[classifier_type] is None:
             assert self.weights_dir_path is not None
-            file_name = f"{classifier_type}_classifier_{self.language}.pkg.gz"
+            file_name = f"{classifier_type}_classifier_{self.language}.json"
            classifier_path = os.path.join(self.weights_dir_path, file_name)
             if not os.path.isfile(classifier_path):
                 download_from_hub(out_dir=self.weights_dir_path,
                                   out_name=file_name,
                                   repo_name="fintoc_classifiers",
-                                  hub_name=f"{classifier_type}_classifier_{self.language}_txt_layer.pkg.gz")
+                                  hub_name=f"{classifier_type}_classifier_{self.language}_txt_layer.json")
 
-            with gzip.open(classifier_path, "rb") as input_file:
-                self.classifiers[classifier_type] = pickle.load(file=input_file)
+            classifier = XGBClassifier()
+            classifier.load_model(classifier_path)
+            self.classifiers[classifier_type] = classifier
 
         return self.classifiers[classifier_type]
diff --git a/dedoc/utils/langchain.py b/dedoc/utils/langchain.py
new file mode 100644
index 00000000..a006ede4
--- /dev/null
+++ b/dedoc/utils/langchain.py
@@ -0,0 +1,239 @@
+from dedoc.extensions import converted_extensions, recognized_extensions
+
+
+supported_extensions = {
+    format_group: {*recognized_extensions._asdict()[format_group], *converted_extensions._asdict()[format_group]}
+    for format_group in recognized_extensions._asdict().keys()
+}
+
+
+def make_manager_config(file_path: str, split: str, parsing_params: dict) -> dict:  # noqa: C901
+    from dedoc.utils.parameter_utils import get_param_with_attachments
+    from dedoc.utils.utils import get_mime_extension
+    from dedoc.common.exceptions.bad_file_error import BadFileFormatError
+
+    if get_param_with_attachments(parsing_params):
+        return make_minimal_manager_config(split, parsing_params)
+
+    mime, extension = get_mime_extension(file_path=file_path)
+
+    if extension in supported_extensions["excel_like_format"]:
+        from dedoc.converters.concrete_converters.excel_converter import ExcelConverter
+        from dedoc.readers.excel_reader.excel_reader import ExcelReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = ExcelConverter(), ExcelReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["docx_like_format"]:
+        from dedoc.converters.concrete_converters.docx_converter import DocxConverter
+        from dedoc.readers.docx_reader.docx_reader import DocxReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor
+        converter, reader, metadata_extractor = DocxConverter(), DocxReader(), DocxMetadataExtractor()
+    elif extension in supported_extensions["pptx_like_format"]:
+        from dedoc.converters.concrete_converters.pptx_converter import PptxConverter
+        from dedoc.readers.pptx_reader.pptx_reader import PptxReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = PptxConverter(), PptxReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["html_like_format"]:
+        from dedoc.readers.html_reader.html_reader import HtmlReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, HtmlReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["eml_like_format"]:
+        from dedoc.readers.email_reader.email_reader import EmailReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, EmailReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["mhtml_like_format"]:
+        from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, MhtmlReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["archive_like_format"]:
+        from dedoc.readers.archive_reader.archive_reader import ArchiveReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, ArchiveReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["image_like_format"]:
+        from dedoc.converters.concrete_converters.png_converter import PNGConverter
+        from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor
+        converter, reader, metadata_extractor = PNGConverter(), PdfImageReader(), ImageMetadataExtractor()
+    elif extension in supported_extensions["pdf_like_format"]:
+        from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
+        from dedoc.converters.concrete_converters.pdf_converter import PDFConverter
+        from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor
+        pdf_with_text_layer = get_param_pdf_with_txt_layer(parsing_params)
+        if pdf_with_text_layer == "true":
+            from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
+            converter, reader, metadata_extractor = PDFConverter(), PdfTxtlayerReader(), PdfMetadataExtractor()
+        elif pdf_with_text_layer == "tabby":
+            from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
+            converter, reader, metadata_extractor = PDFConverter(), PdfTabbyReader(), PdfMetadataExtractor()
+        elif pdf_with_text_layer == "false":
+            from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
+            converter, reader, metadata_extractor = PDFConverter(), PdfImageReader(), PdfMetadataExtractor()
+        else:
+            from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader
+            converter, reader, metadata_extractor = PDFConverter(), PdfAutoReader(), PdfMetadataExtractor()
+    elif extension in supported_extensions["csv_like_format"]:
+        from dedoc.readers.csv_reader.csv_reader import CSVReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, CSVReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["txt_like_format"]:
+        from dedoc.converters.concrete_converters.txt_converter import TxtConverter
+        from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = TxtConverter(), RawTextReader(), BaseMetadataExtractor()
+    elif extension in supported_extensions["json_like_format"]:
+        from dedoc.readers.json_reader.json_reader import JsonReader
+        from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+        converter, reader, metadata_extractor = None, JsonReader(), BaseMetadataExtractor()
+    else:
+        raise BadFileFormatError(f'Could not find the suitable reader for the file with mime = "{mime}", extension = "{extension}".')
+
+    if split == "node":
+        from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor
+        constructors, default_constructor = {"tree": TreeConstructor()}, TreeConstructor()
+    else:
+        from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor
+        constructors, default_constructor = {"linear": LinearConstructor()}, LinearConstructor()
+
+    from dedoc.converters.converter_composition import ConverterComposition
+    from dedoc.readers.reader_composition import ReaderComposition
+    from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition
+    from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition
+    from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
+    from dedoc.attachments_handler.attachments_handler import AttachmentsHandler
+    from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
+
+    manager_config = dict(
+        converter=ConverterComposition(converters=[converter] if converter else []),
+        reader=ReaderComposition(readers=[reader]),
+        structure_extractor=StructureExtractorComposition(extractors={"other": DefaultStructureExtractor()}, default_key="other"),
+        structure_constructor=StructureConstructorComposition(constructors=constructors, default_constructor=default_constructor),
+        document_metadata_extractor=MetadataExtractorComposition(extractors=[metadata_extractor]),
+        attachments_handler=AttachmentsHandler()
+    )
+    return manager_config
+
+
+def make_manager_pdf_config(file_path: str, split: str, parsing_params: dict) -> dict:  # noqa: C901
+    from dedoc.utils.parameter_utils import get_param_with_attachments
+    from dedoc.utils.utils import get_mime_extension
+    from dedoc.common.exceptions.bad_file_error import BadFileFormatError
+
+    if get_param_with_attachments(parsing_params):
+        return make_minimal_manager_config(split, parsing_params)
+
+    mime, extension = get_mime_extension(file_path=file_path)
+
+    if extension in supported_extensions["pdf_like_format"]:
+        from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
+        from dedoc.converters.concrete_converters.pdf_converter import PDFConverter
+        from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor
+        pdf_with_text_layer = get_param_pdf_with_txt_layer(parsing_params)
+        if pdf_with_text_layer == "true":
+            from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
+            converter, reader, metadata_extractor = PDFConverter(), PdfTxtlayerReader(), PdfMetadataExtractor()
+        elif pdf_with_text_layer == "tabby":
+            from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
+            converter, reader, metadata_extractor = PDFConverter(), PdfTabbyReader(), PdfMetadataExtractor()
+        elif pdf_with_text_layer == "false":
+            from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
+            converter, reader, metadata_extractor = PDFConverter(), PdfImageReader(), PdfMetadataExtractor()
+        else:
+            from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader
+            converter, reader, metadata_extractor = PDFConverter(), PdfAutoReader(), PdfMetadataExtractor()
+    else:
+        raise BadFileFormatError(f'Could not find the suitable reader for the file with mime = "{mime}", extension = "{extension}".')  # noqa: T201
+
+    if split == "node":
+        from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor
+        constructors, default_constructor = {"tree": TreeConstructor()}, TreeConstructor()
+    else:
+        from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor
+        constructors, default_constructor = {"linear": LinearConstructor()}, LinearConstructor()
+
+    from dedoc.converters.converter_composition import ConverterComposition
+    from dedoc.readers.reader_composition import ReaderComposition
+    from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition
+    from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition
+    from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
+    from dedoc.attachments_handler.attachments_handler import AttachmentsHandler
+    from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
+
+    manager_config = dict(
+        converter=ConverterComposition(converters=[converter]),
+        reader=ReaderComposition(readers=[reader]),
+        structure_extractor=StructureExtractorComposition(extractors={"other": DefaultStructureExtractor()}, default_key="other"),
+        structure_constructor=StructureConstructorComposition(constructors=constructors, default_constructor=default_constructor),
+        document_metadata_extractor=MetadataExtractorComposition(extractors=[metadata_extractor]),
+        attachments_handler=AttachmentsHandler()
+    )
+    return manager_config
+
+
+def make_minimal_manager_config(split: str, parsing_params: dict) -> dict:  # noqa: C901
+    from dedoc.attachments_handler.attachments_handler import AttachmentsHandler
+    from dedoc.converters.concrete_converters.binary_converter import BinaryConverter
+    from dedoc.converters.concrete_converters.docx_converter import DocxConverter
+    from dedoc.converters.concrete_converters.excel_converter import ExcelConverter
+    from dedoc.converters.concrete_converters.pdf_converter import PDFConverter
+    from dedoc.converters.concrete_converters.png_converter import PNGConverter
+    from dedoc.converters.concrete_converters.pptx_converter import PptxConverter
+    from dedoc.converters.concrete_converters.txt_converter import TxtConverter
+    from dedoc.converters.converter_composition import ConverterComposition
+    from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
+    from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor
+    from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor
+    from dedoc.metadata_extractors.concrete_metadata_extractors.note_metadata_extarctor import NoteMetadataExtractor
+    from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor
+    from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
+    from dedoc.readers.archive_reader.archive_reader import ArchiveReader
+    from dedoc.readers.csv_reader.csv_reader import CSVReader
+    from dedoc.readers.docx_reader.docx_reader import DocxReader
+    from dedoc.readers.email_reader.email_reader import EmailReader
+    from dedoc.readers.excel_reader.excel_reader import ExcelReader
+    from dedoc.readers.html_reader.html_reader import HtmlReader
+    from dedoc.readers.json_reader.json_reader import JsonReader
+    from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader
+    from dedoc.readers.note_reader.note_reader import NoteReader
+    from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
+    from dedoc.readers.pptx_reader.pptx_reader import PptxReader
+    from dedoc.readers.reader_composition import ReaderComposition
+    from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
+    from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition
+    from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
+    from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition
+    from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
+
+    converters = [DocxConverter(), ExcelConverter(), PptxConverter(), TxtConverter(), PDFConverter(), PNGConverter(), BinaryConverter()]
+    readers = []
+    pdf_with_text_layer = get_param_pdf_with_txt_layer(parsing_params)
+    if pdf_with_text_layer == "true":
+        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
+        readers.append(PdfTxtlayerReader())
+    elif pdf_with_text_layer == "tabby":
+        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
+        readers.append(PdfTabbyReader())
+    elif pdf_with_text_layer != "false":
+        from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader
+        readers.append(PdfAutoReader())
+
+    readers.extend([
+        DocxReader(), ExcelReader(), PptxReader(), RawTextReader(), CSVReader(), HtmlReader(), NoteReader(), JsonReader(), ArchiveReader(), PdfImageReader(),
+        EmailReader(), MhtmlReader()
+    ])
+
+    metadata_extractors = [DocxMetadataExtractor(), PdfMetadataExtractor(), ImageMetadataExtractor(), NoteMetadataExtractor(), BaseMetadataExtractor()]
+
+    if split == "node":
+        from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor
+        constructors, default_constructor = {"tree": TreeConstructor()}, TreeConstructor()
+    else:
+        from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor
+        constructors, default_constructor = {"linear": LinearConstructor()}, LinearConstructor()
+
+    return dict(
+        converter=ConverterComposition(converters=converters),
+        reader=ReaderComposition(readers=readers),
+        structure_extractor=StructureExtractorComposition(extractors={"other": DefaultStructureExtractor()}, default_key="other"),
+        structure_constructor=StructureConstructorComposition(constructors=constructors, default_constructor=default_constructor),
+        document_metadata_extractor=MetadataExtractorComposition(extractors=metadata_extractors),
+        attachments_handler=AttachmentsHandler()
+    )
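`make_manager_config` assembles a slimmed-down `DedocManager` configuration with a single reader, converter and metadata extractor chosen by the file's extension, instead of the full default pipeline. A sketch of wiring it up (the file name is illustrative; `DedocManager` and `parse` are used the same way in the loader later in this diff):

```python
from dedoc import DedocManager

from dedoc.utils.langchain import make_manager_config

file_path = "example.docx"  # illustrative input
manager = DedocManager(manager_config=make_manager_config(file_path=file_path, split="node", parsing_params={}))

# "structure_type": "tree" matches split="node", mirroring the parameters
# that the langchain loader passes when splitting by tree nodes.
document = manager.parse(file_path=file_path, parameters={"structure_type": "tree"})
print(document.to_api_schema().dict()["content"]["structure"]["subparagraphs"][0]["text"])
```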
diff --git a/docker-compose.yml b/docker-compose.yml
index 58d36ef2..88ea8a14 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -21,6 +21,7 @@ services:
       - dedoc
     build:
       context: .
+      dockerfile: tests/Dockerfile
     tty: true
     environment:
       DOC_READER_HOST: "dedoc"
@@ -28,9 +29,6 @@ services:
       GROBID_HOST: "grobid"
       GROBID_PORT: 8070
       is_test: $test
-      PYTHONPATH: $PYTHONPATH:/dedoc_root/tests:/dedoc_root
-    command:
-      bash dedoc_root/tests/run_tests_in_docker.sh
 
   grobid:
     image: "lfoppiano/grobid:0.8.0"
diff --git a/docs/source/_static/code_examples/article_classifier.pkl.gz b/docs/source/_static/code_examples/article_classifier.pkl.gz
deleted file mode 100644
index a2f2355b..00000000
Binary files a/docs/source/_static/code_examples/article_classifier.pkl.gz and /dev/null differ
diff --git a/docs/source/_static/code_examples/article_classifier.zip b/docs/source/_static/code_examples/article_classifier.zip
new file mode 100644
index 00000000..98107cf1
Binary files /dev/null and b/docs/source/_static/code_examples/article_classifier.zip differ
diff --git a/docs/source/_static/code_examples/article_structure_extractor.py b/docs/source/_static/code_examples/article_structure_extractor.py
index cbb5d6cd..f59fb6a1 100644
--- a/docs/source/_static/code_examples/article_structure_extractor.py
+++ b/docs/source/_static/code_examples/article_structure_extractor.py
@@ -19,7 +19,7 @@ class ArticleStructureExtractor(AbstractStructureExtractor):
     def __init__(self, *, config: Optional[dict] = None) -> None:
         super().__init__(config=config)
         path = os.path.abspath(os.path.dirname(__file__))  # path to the directory where the classifier weights are located
-        self.classifier = ArticleLineTypeClassifier(path=os.path.join(path, "article_classifier.pkl.gz"), config=self.config)
+        self.classifier = ArticleLineTypeClassifier(path=os.path.join(path, "article_classifier.zip"), config=self.config)
 
         self.named_item_keywords = ("abstract", "introduction", "related work", "conclusion", "references", "appendix", "acknowledgements")
 
diff --git a/docs/source/_static/code_examples/langchain/dedoc_loader.py b/docs/source/_static/code_examples/langchain/dedoc_loader.py
new file mode 100644
index 00000000..2e037694
--- /dev/null
+++ b/docs/source/_static/code_examples/langchain/dedoc_loader.py
@@ -0,0 +1,416 @@
+from abc import ABC, abstractmethod
+from typing import (
+    Dict,
+    Iterator,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+
+
+class DedocBaseLoader(BaseLoader, ABC):
+    """
+    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).
+
+    Loader enables extracting text, tables and attached files from the given file:
+        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
+          (according to the `split` parameter).
+        * `Attached files` (when with_attachments=True)
+          are split according to the `split` parameter.
+          For attachments, langchain Document object has an additional metadata field
+          `type`="attachment".
+        * `Tables` (when with_tables=True) are not split - each table corresponds to one
+          langchain Document object.
+          For tables, Document object has additional metadata fields `type`="table"
+          and `text_as_html` with table HTML representation.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        split: str = "document",
+        with_tables: bool = True,
+        **dedoc_kwargs: Union[str, bool],
+    ) -> None:
+        """
+        Initialize with file path and parsing parameters.
+
+        Args:
+            file_path: path to the file for processing
+            split: type of document splitting into parts (each part is returned
+                separately), default value "document"
+                "document": document text is returned as a single langchain Document
+                    object (don't split)
+                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
+                    ODP)
+                "node": split document text into tree nodes (title nodes, list item
+                    nodes, raw text nodes)
+                "line": split document text into lines
+            with_tables: add tables to the result - each table is returned as a single
+                langchain Document object
+
+            dedoc_kwargs: parameters used for document parsing via `dedoc`
+                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html).
+                with_attachments: enable attached files extraction
+                recursion_deep_attachments: recursion level for attached files
+                    extraction, works only when with_attachments==True
+                pdf_with_text_layer: type of handler for parsing PDF documents,
+                    available options
+                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
+                language: language of the document for PDF without a textual layer and
+                    images, available options ["eng", "rus", "rus+eng" (default)],
+                    the list of languages can be extended, please see
+                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
+                pages: page slice to define the reading range for parsing PDF documents
+                is_one_column_document: detect number of columns for PDF without
+                    a textual layer and images, available options
+                    ["true", "false", "auto" (default)]
+                document_orientation: fix document orientation (90, 180, 270 degrees)
+                    for PDF without a textual layer and images, available options
+                    ["auto" (default), "no_change"]
+                need_header_footer_analysis: remove headers and footers from the output
+                    result for parsing PDF and images
+                need_binarization: clean pages background (binarize) for PDF without a
+                    textual layer and images
+                need_pdf_table_analysis: parse tables for PDF without a textual layer
+                    and images
+                delimiter: column separator for CSV, TSV files
+                encoding: encoding of TXT, CSV, TSV
+        """
+        self.valid_split_values = {"document", "page", "node", "line"}
+        if split not in self.valid_split_values:
+            raise ValueError(
+                f"Got {split} for `split`, but should be one of "
+                f"`{self.valid_split_values}`"
+            )
+        self.split = split
+
+        self.with_tables = with_tables
+        self.file_path = file_path
+        with_attachments = str(dedoc_kwargs.get("with_attachments", "false")).lower()
+        self.parsing_parameters = {
+            **dedoc_kwargs,
+            **{
+                "structure_type": "tree" if self.split == "node" else "linear",
+                "document_type": "other",
+                "need_content_analysis": with_attachments,
+            },
+        }
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazily load documents."""
+        import tempfile
+
+        try:
+            from dedoc import DedocManager
+        except ImportError:
+            raise ImportError(
+                "`dedoc` package not found, please install it with `pip install dedoc`"
+            )
+        dedoc_manager = DedocManager(manager_config=self._make_config())
+        dedoc_manager.config["logger"].disabled = True
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            document_tree = dedoc_manager.parse(
+                file_path=self.file_path,
+                parameters={**self.parsing_parameters, "attachments_dir": tmpdir},
+            )
+        yield from self._split_document(
+            document_tree=document_tree.to_api_schema().dict(), split=self.split
+        )
+
+    @abstractmethod
+    def _make_config(self) -> dict:
+        """
+        Make configuration for DedocManager according to the file extension and
+        parsing parameters.
+ """ + pass + + def _json2txt(self, paragraph: dict) -> str: + """Get text (recursively) of the document tree node.""" + subparagraphs_text = "\n".join( + [ + self._json2txt(subparagraph) + for subparagraph in paragraph["subparagraphs"] + ] + ) + text = ( + f"{paragraph['text']}\n{subparagraphs_text}" + if subparagraphs_text + else paragraph["text"] + ) + return text + + def _parse_subparagraphs( + self, document_tree: dict, document_metadata: dict + ) -> Iterator[Document]: + """Parse recursively document tree obtained by `dedoc`.""" + if len(document_tree["subparagraphs"]) > 0: + for subparagraph in document_tree["subparagraphs"]: + yield from self._parse_subparagraphs( + document_tree=subparagraph, document_metadata=document_metadata + ) + else: + yield Document( + page_content=document_tree["text"], + metadata={**document_metadata, **document_tree["metadata"]}, + ) + + def _split_document( + self, + document_tree: dict, + split: str, + additional_metadata: Optional[dict] = None, + ) -> Iterator[Document]: + """Split document into parts according to the `split` parameter.""" + document_metadata = document_tree["metadata"] + if additional_metadata: + document_metadata = {**document_metadata, **additional_metadata} + + if split == "document": + text = self._json2txt(paragraph=document_tree["content"]["structure"]) + yield Document(page_content=text, metadata=document_metadata) + + elif split == "page": + nodes = document_tree["content"]["structure"]["subparagraphs"] + page_id = nodes[0]["metadata"]["page_id"] + page_text = "" + + for node in nodes: + if node["metadata"]["page_id"] == page_id: + page_text += self._json2txt(node) + else: + yield Document( + page_content=page_text, + metadata={**document_metadata, "page_id": page_id}, + ) + page_id = node["metadata"]["page_id"] + page_text = self._json2txt(node) + + yield Document( + page_content=page_text, + metadata={**document_metadata, "page_id": page_id}, + ) + + elif split == "line": + for node in document_tree["content"]["structure"]["subparagraphs"]: + line_metadata = node["metadata"] + yield Document( + page_content=self._json2txt(node), + metadata={**document_metadata, **line_metadata}, + ) + + elif split == "node": + yield from self._parse_subparagraphs( + document_tree=document_tree["content"]["structure"], + document_metadata=document_metadata, + ) + + else: + raise ValueError( + f"Got {split} for `split`, but should be one of " + f"`{self.valid_split_values}`" + ) + + if self.with_tables: + for table in document_tree["content"]["tables"]: + table_text, table_html = self._get_table(table) + yield Document( + page_content=table_text, + metadata={ + **table["metadata"], + "type": "table", + "text_as_html": table_html, + }, + ) + + for attachment in document_tree["attachments"]: + yield from self._split_document( + document_tree=attachment, + split=self.split, + additional_metadata={"type": "attachment"}, + ) + + def _get_table(self, table: dict) -> Tuple[str, str]: + """Get text and HTML representation of the table.""" + table_text = "" + for row in table["cells"]: + for cell in row: + table_text += " ".join(line["text"] for line in cell["lines"]) + table_text += "\t" + table_text += "\n" + + table_html = ( + '