update master (#472)

Co-authored-by: Alexander Golodkov <[email protected]>
Co-authored-by: Andrew Perminov <[email protected]>
Co-authored-by: Bogatenkova Anastasiya <[email protected]>
ispras · Jul 15, 2024 · 8a2678c · 8a2678c
1 parent 5750d57
commit 8a2678c
Show file tree

Hide file tree

Showing 34 changed files with 957 additions and 119 deletions.
diff --git a/.flake8 b/.flake8
@@ -49,3 +49,4 @@ per-file-ignores =
     scripts/benchmark_pdf_performance*:JS101
     tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
     docs/source/_static/code_examples/*:I251
+    docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
diff --git a/Dockerfile b/Dockerfile
@@ -1,25 +1,22 @@
 ARG REPOSITORY="docker.io"
 FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
 ARG LANGUAGES=""
-RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done
+RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$(echo $lang | tr "_" "-"); done
 
 ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
 ENV RESOURCES_PATH "/dedoc_root/resources"
 
-ADD requirements.txt .
+COPY requirements.txt .
 RUN pip3 install --no-cache-dir -r requirements.txt
 
 RUN mkdir /dedoc_root
 RUN mkdir /dedoc_root/dedoc
-ADD dedoc/config.py /dedoc_root/dedoc/config.py
-ADD dedoc/download_models.py /dedoc_root/dedoc/download_models.py
+COPY dedoc/config.py /dedoc_root/dedoc/config.py
+COPY dedoc/download_models.py /dedoc_root/dedoc/download_models.py
 RUN python3 /dedoc_root/dedoc/download_models.py
 
-ADD dedoc /dedoc_root/dedoc
-ADD VERSION /dedoc_root
+COPY dedoc /dedoc_root/dedoc
+COPY VERSION /dedoc_root
 RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py
 
-ADD tests /dedoc_root/tests
-ADD resources /dedoc_root/resources
-
-CMD ["python3", "/dedoc_root/dedoc/main.py"]
+CMD [ "python3", "/dedoc_root/dedoc/main.py" ]
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.2.4
+2.2.5
diff --git a/dedoc/download_models.py b/dedoc/download_models.py
@@ -5,12 +5,12 @@
 Keys are the names of repositories with models.
 """
 model_hash_dict = dict(
-    txtlayer_classifier="94e27e184fa2876883d260e0aa58b042e6ab3e35",
+    txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
     scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
     font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
-    paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
-    line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013",
-    fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263"
+    paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b",
+    line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683",
+    fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8"
 )
 
 
@@ -27,29 +27,29 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str
 def download(resources_path: str) -> None:
     import os
 
-    download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz")
+    download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.json", repo_name="txtlayer_classifier", hub_name="model.json")
 
     download_from_hub(out_dir=resources_path,
                       out_name="scan_orientation_efficient_net_b0.pth",
                       repo_name="scan_orientation_efficient_net_b0",
                       hub_name="model.pth")
 
-    download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz")
+    download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.zip", repo_name="paragraph_classifier", hub_name="model.zip")
 
     line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers")
     for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"):
         download_from_hub(out_dir=line_clf_resources_path,
-                          out_name=f"{classifier_type}_classifier.pkl.gz",
+                          out_name=f"{classifier_type}_classifier.zip",
                           repo_name="line_type_classifiers",
-                          hub_name=f"{classifier_type}.pkl.gz")
+                          hub_name=f"{classifier_type}.zip")
 
     fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers")
     for language in ("en", "fr", "sp"):
         for classifier_type in ("target", "binary"):
             download_from_hub(out_dir=fintoc_classifiers_resources_path,
-                              out_name=f"{classifier_type}_classifier_{language}.pkg.gz",
+                              out_name=f"{classifier_type}_classifier_{language}.json",
                               repo_name="fintoc_classifiers",
-                              hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz")
+                              hub_name=f"{classifier_type}_classifier_{language}_txt_layer.json")
 
 
 if __name__ == "__main__":

diff --git a/dedoc/extensions.py b/dedoc/extensions.py
@@ -19,7 +19,7 @@
 
 
 converted_extensions = Extensions(
-    excel_like_format={".ods", "xls"},
+    excel_like_format={".ods", ".xls"},
     docx_like_format={".odt", ".doc", ".rtf"},
     pptx_like_format={".odp", ".ppt"},
     html_like_format={},

diff --git a/dedoc/readers/email_reader/email_reader.py b/dedoc/readers/email_reader/email_reader.py
@@ -44,11 +44,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         import os
         import uuid
         from dedoc.data_structures.attached_file import AttachedFile
-        from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
+        from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments
         from dedoc.utils.utils import get_unique_name
 
         parameters = {} if parameters is None else parameters
         attachments_dir = get_param_attachments_dir(parameters, file_path)
+        with_attachments = get_param_with_attachments(parameters)
+        need_content_analysis = get_param_need_content_analysis(parameters)
 
         with open(file_path, "rb") as f:
             msg = email.message_from_binary_file(f)
@@ -58,16 +60,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         lines = self.__get_main_fields(msg)
         header_filename = "message_header_" + get_unique_name("message_header.json")
 
-        # saving message header into separated file as an attachment
-        header_file_path = os.path.join(attachments_dir, header_filename)
-        with open(header_file_path, "w", encoding="utf-8") as f:
-            json.dump(all_header_fields, f, ensure_ascii=False, indent=4)
-
-        need_content_analysis = get_param_need_content_analysis(parameters)
-        attachments.append(AttachedFile(original_name=header_filename,
-                                        tmp_file_path=header_file_path,
-                                        uid=f"attach_{uuid.uuid1()}",
-                                        need_content_analysis=need_content_analysis))
+        if with_attachments:
+            # saving message header into separated file as an attachment
+            header_file_path = os.path.join(attachments_dir, header_filename)
+            with open(header_file_path, "w", encoding="utf-8") as f:
+                json.dump(all_header_fields, f, ensure_ascii=False, indent=4)
+            attachments.append(AttachedFile(original_name=header_filename,
+                                            tmp_file_path=header_file_path,
+                                            uid=f"attach_{uuid.uuid1()}",
+                                            need_content_analysis=need_content_analysis))
 
         html_found = False
         text_parts = []
@@ -92,7 +93,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
             if part.is_multipart():
                 continue
 
-            self.__add_attachment(part, attachments_dir, attachments, need_content_analysis)
+            if with_attachments:
+                self.__add_attachment(part, attachments_dir, attachments, need_content_analysis)
 
         # text/plain has the same content as text/html
         if not html_found:

diff --git a/dedoc/readers/mhtml_reader/mhtml_reader.py b/dedoc/readers/mhtml_reader/mhtml_reader.py
@@ -36,7 +36,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
-        from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
+        from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments
 
         parameters = {} if parameters is None else parameters
         attachments_dir = get_param_attachments_dir(parameters, file_path)
@@ -51,16 +51,21 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
             lines.extend(result.lines)
             tables.extend(result.tables)
 
-        need_content_analysis = get_param_need_content_analysis(parameters)
         tmp_file_names = []
         original_file_names = []
         for tmp_file_name, original_file_name in zip(names_list, original_names_list):
             if tmp_file_name not in names_html:
                 tmp_file_names.append(tmp_file_name)
                 original_file_names.append(original_file_name)
 
-        attachments = self.__get_attachments(save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names,
-                                             need_content_analysis=need_content_analysis)
+        with_attachments = get_param_with_attachments(parameters)
+        need_content_analysis = get_param_need_content_analysis(parameters)
+        if with_attachments:
+            attachments = self.__get_attachments(
+                save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names, need_content_analysis=need_content_analysis
+            )
+        else:
+            attachments = []
 
         return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments)
 

diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py
@@ -1,7 +1,5 @@
-import gzip
 import logging
 import os
-import pickle
 from typing import List
 
 from xgboost import XGBClassifier
@@ -22,7 +20,7 @@ def __init__(self, *, config: dict) -> None:
         self.logger = config.get("logger", logging.getLogger())
 
         self.feature_extractor = TxtlayerFeatureExtractor()
-        self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.pkl.gz")
+        self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.json")
         self.__model = None
 
     @property
@@ -32,11 +30,11 @@ def __get_model(self) -> XGBClassifier:
 
         if not os.path.isfile(self.path):
             out_dir, out_name = os.path.split(self.path)
-            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.pkl.gz")
+            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.json")
 
         assert os.path.isfile(self.path)
-        with gzip.open(self.path, "rb") as f:
-            self.__model = pickle.load(f)
+        self.__model = XGBClassifier()
+        self.__model.load_model(self.path)
 
         if get_param_gpu_available(self.config, self.logger):
             gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py
@@ -1,7 +1,8 @@
-import gzip
+import json
 import logging
 import os
-import pickle
+import tempfile
+import zipfile
 from typing import List
 
 from xgboost import XGBClassifier
@@ -21,7 +22,7 @@ class ScanParagraphClassifierExtractor(object):
     def __init__(self, *, config: dict) -> None:
         super().__init__()
         self.logger = config.get("logger", logging.getLogger())
-        self.path = os.path.join(get_config()["resources_path"], "paragraph_classifier.pkl.gz")
+        self.path = os.path.join(get_config()["resources_path"], "paragraph_classifier.zip")
         self.config = config
         self._feature_extractor = None
         self._classifier = None
@@ -41,11 +42,17 @@ def classifier(self) -> XGBClassifier:
     def _unpickle(self) -> None:
         if not os.path.isfile(self.path):
             out_dir, out_name = os.path.split(self.path)
-            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="paragraph_classifier", hub_name="model.pkl.gz")
+            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="paragraph_classifier", hub_name="model.zip")
 
-        with gzip.open(self.path) as file:
-            self._classifier, parameters = pickle.load(file)
-            self._feature_extractor = ParagraphFeatureExtractor(**parameters, config=self.config)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with zipfile.ZipFile(self.path) as archive:
+                archive.extractall(tmpdir)
+
+            with open(os.path.join(tmpdir, "parameters.json")) as parameters_file:
+                parameters = json.load(parameters_file)
+            self._classifier = XGBClassifier()
+            self._classifier.load_model(os.path.join(tmpdir, "classifier.json"))
+        self._feature_extractor = ParagraphFeatureExtractor(**parameters, config=self.config)
 
         if get_param_gpu_available(self.config, self.logger):
             gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)

diff --git a/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py
@@ -26,8 +26,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         from dedoc.structure_extractors.line_type_classifiers.law_classifier import LawLineTypeClassifier
 
         path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
-        self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=self.config)
-        self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=self.config)
+        self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.zip"), config=self.config)
+        self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.zip"), config=self.config)
         self.hierarchy_level_builders = [StubHierarchyLevelBuilder()]
         self.hl_type = "law"
         self.init_hl_depth = 1

diff --git a/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py
@@ -32,7 +32,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         self.toc_builder = TocBuilder()
         self.body_builder = DiplomaBodyBuilder()
         path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
-        self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=self.config)
+        self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.zip"), config=self.config)
         self.footnote_start_regexp = re.compile(r"^\d+ ")
 
     def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:

diff --git a/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py
@@ -29,8 +29,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         self.body_builder = TzBodyBuilder()
         self.toc_builder = TocBuilder()
         path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
-        self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=self.config)
-        self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=self.config)
+        self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.zip"), config=self.config)
+        self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.zip"), config=self.config)
 
     def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """

diff --git a/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py b/dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py
@@ -1,7 +1,8 @@
-import gzip
+import json
 import logging
 import os
-import pickle
+import tempfile
+import zipfile
 from abc import ABC
 from typing import Optional, Tuple
 
@@ -32,10 +33,16 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]:
         """
         if not os.path.isfile(path):
             out_dir, out_name = os.path.split(path)
-            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.pkl.gz")
+            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.zip")
 
-        with gzip.open(path) as file:
-            classifier, feature_extractor_parameters = pickle.load(file)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with zipfile.ZipFile(path) as archive:
+                archive.extractall(tmpdir)
+
+            with open(os.path.join(tmpdir, "parameters.json")) as parameters_file:
+                feature_extractor_parameters = json.load(parameters_file)
+            classifier = XGBClassifier()
+            classifier.load_model(os.path.join(tmpdir, "classifier.json"))
 
         if get_param_gpu_available(self.config, self.logger):
             gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)
@@ -44,19 +51,27 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]:
 
         return classifier, feature_extractor_parameters
 
-    def save(self, path_out: str, object_for_saving: object) -> str:
+    @staticmethod
+    def save(path_out: str, classifier: XGBClassifier, parameters: dict) -> str:
         """
-        Save the pickled classifier (with initialization parameters for a feature extractor) into the `.pkl.gz` file with path=`path_out`
+        Save the classifier (with initialization parameters for a feature extractor) into the `.zip` file with path=`path_out`
+
+        * classifier -> classifier.json
+        * parameters -> parameters.json
 
         :param path_out: path (with file name) where to save the object
-        :param object_for_saving: classifier with feature extractor's parameters to save
+        :param classifier: classifier to save
+        :param parameters: feature extractor parameters to save
         :return: the resulting path of the saved file
         """
-        if path_out.endswith(".pkl"):
-            path_out += ".gz"
-        elif not path_out.endswith(".gz"):
-            path_out += ".pkl.gz"
+        with tempfile.TemporaryDirectory() as tmpdir:
+            clf_path = os.path.join(tmpdir, "classifier.json")
+            params_path = os.path.join(tmpdir, "parameters.json")
+            classifier.save_model(clf_path)
+            with open(params_path, "w") as out_file:
+                json.dump(parameters, out_file)
 
-        with gzip.open(path_out, "wb") as file_out:
-            pickle.dump(obj=object_for_saving, file=file_out)
+            with zipfile.ZipFile(path_out, "w") as archive:
+                archive.write(clf_path, os.path.basename(clf_path))
+                archive.write(params_path, os.path.basename(params_path))
         return path_out