Skip to content

Commit

Permalink
update master (#472)
Browse files: browse the repository at this point in the history
Co-authored-by: Alexander Golodkov <[email protected]>
Co-authored-by: Andrew Perminov <[email protected]>
Co-authored-by: Bogatenkova Anastasiya <[email protected]>
  • Loading branch information
4 people authored Jul 15, 2024
1 parent 5750d57 commit 8a2678c
Show file tree
Hide file tree
Showing 34 changed files with 957 additions and 119 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,4 @@ per-file-ignores =
scripts/benchmark_pdf_performance*:JS101
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
docs/source/_static/code_examples/*:I251
docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
17 changes: 7 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,22 @@
ARG REPOSITORY="docker.io"
FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
ARG LANGUAGES=""
RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done
RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$(echo $lang | tr "_" "-"); done

ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
ENV RESOURCES_PATH "/dedoc_root/resources"

ADD requirements.txt .
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

RUN mkdir /dedoc_root
RUN mkdir /dedoc_root/dedoc
ADD dedoc/config.py /dedoc_root/dedoc/config.py
ADD dedoc/download_models.py /dedoc_root/dedoc/download_models.py
COPY dedoc/config.py /dedoc_root/dedoc/config.py
COPY dedoc/download_models.py /dedoc_root/dedoc/download_models.py
RUN python3 /dedoc_root/dedoc/download_models.py

ADD dedoc /dedoc_root/dedoc
ADD VERSION /dedoc_root
COPY dedoc /dedoc_root/dedoc
COPY VERSION /dedoc_root
RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py

ADD tests /dedoc_root/tests
ADD resources /dedoc_root/resources

CMD ["python3", "/dedoc_root/dedoc/main.py"]
CMD [ "python3", "/dedoc_root/dedoc/main.py" ]
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.2.4
2.2.5
20 changes: 10 additions & 10 deletions dedoc/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
Keys are the names of repositories with models.
"""
model_hash_dict = dict(
txtlayer_classifier="94e27e184fa2876883d260e0aa58b042e6ab3e35",
txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013",
fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263"
paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b",
line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683",
fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8"
)


Expand All @@ -27,29 +27,29 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str
def download(resources_path: str) -> None:
import os

download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz")
download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.json", repo_name="txtlayer_classifier", hub_name="model.json")

download_from_hub(out_dir=resources_path,
out_name="scan_orientation_efficient_net_b0.pth",
repo_name="scan_orientation_efficient_net_b0",
hub_name="model.pth")

download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz")
download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.zip", repo_name="paragraph_classifier", hub_name="model.zip")

line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers")
for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"):
download_from_hub(out_dir=line_clf_resources_path,
out_name=f"{classifier_type}_classifier.pkl.gz",
out_name=f"{classifier_type}_classifier.zip",
repo_name="line_type_classifiers",
hub_name=f"{classifier_type}.pkl.gz")
hub_name=f"{classifier_type}.zip")

fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers")
for language in ("en", "fr", "sp"):
for classifier_type in ("target", "binary"):
download_from_hub(out_dir=fintoc_classifiers_resources_path,
out_name=f"{classifier_type}_classifier_{language}.pkg.gz",
out_name=f"{classifier_type}_classifier_{language}.json",
repo_name="fintoc_classifiers",
hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz")
hub_name=f"{classifier_type}_classifier_{language}_txt_layer.json")


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion dedoc/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


converted_extensions = Extensions(
excel_like_format={".ods", "xls"},
excel_like_format={".ods", ".xls"},
docx_like_format={".odt", ".doc", ".rtf"},
pptx_like_format={".odp", ".ppt"},
html_like_format={},
Expand Down
26 changes: 14 additions & 12 deletions dedoc/readers/email_reader/email_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
import os
import uuid
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments
from dedoc.utils.utils import get_unique_name

parameters = {} if parameters is None else parameters
attachments_dir = get_param_attachments_dir(parameters, file_path)
with_attachments = get_param_with_attachments(parameters)
need_content_analysis = get_param_need_content_analysis(parameters)

with open(file_path, "rb") as f:
msg = email.message_from_binary_file(f)
Expand All @@ -58,16 +60,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
lines = self.__get_main_fields(msg)
header_filename = "message_header_" + get_unique_name("message_header.json")

# saving message header into separated file as an attachment
header_file_path = os.path.join(attachments_dir, header_filename)
with open(header_file_path, "w", encoding="utf-8") as f:
json.dump(all_header_fields, f, ensure_ascii=False, indent=4)

need_content_analysis = get_param_need_content_analysis(parameters)
attachments.append(AttachedFile(original_name=header_filename,
tmp_file_path=header_file_path,
uid=f"attach_{uuid.uuid1()}",
need_content_analysis=need_content_analysis))
if with_attachments:
# saving message header into separated file as an attachment
header_file_path = os.path.join(attachments_dir, header_filename)
with open(header_file_path, "w", encoding="utf-8") as f:
json.dump(all_header_fields, f, ensure_ascii=False, indent=4)
attachments.append(AttachedFile(original_name=header_filename,
tmp_file_path=header_file_path,
uid=f"attach_{uuid.uuid1()}",
need_content_analysis=need_content_analysis))

html_found = False
text_parts = []
Expand All @@ -92,7 +93,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
if part.is_multipart():
continue

self.__add_attachment(part, attachments_dir, attachments, need_content_analysis)
if with_attachments:
self.__add_attachment(part, attachments_dir, attachments, need_content_analysis)

# text/plain has the same content as text/html
if not html_found:
Expand Down
13 changes: 9 additions & 4 deletions dedoc/readers/mhtml_reader/mhtml_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments

parameters = {} if parameters is None else parameters
attachments_dir = get_param_attachments_dir(parameters, file_path)
Expand All @@ -51,16 +51,21 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
lines.extend(result.lines)
tables.extend(result.tables)

need_content_analysis = get_param_need_content_analysis(parameters)
tmp_file_names = []
original_file_names = []
for tmp_file_name, original_file_name in zip(names_list, original_names_list):
if tmp_file_name not in names_html:
tmp_file_names.append(tmp_file_name)
original_file_names.append(original_file_name)

attachments = self.__get_attachments(save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names,
need_content_analysis=need_content_analysis)
with_attachments = get_param_with_attachments(parameters)
need_content_analysis = get_param_need_content_analysis(parameters)
if with_attachments:
attachments = self.__get_attachments(
save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names, need_content_analysis=need_content_analysis
)
else:
attachments = []

return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments)

Expand Down
10 changes: 4 additions & 6 deletions dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import gzip
import logging
import os
import pickle
from typing import List

from xgboost import XGBClassifier
Expand All @@ -22,7 +20,7 @@ def __init__(self, *, config: dict) -> None:
self.logger = config.get("logger", logging.getLogger())

self.feature_extractor = TxtlayerFeatureExtractor()
self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.pkl.gz")
self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.json")
self.__model = None

@property
Expand All @@ -32,11 +30,11 @@ def __get_model(self) -> XGBClassifier:

if not os.path.isfile(self.path):
out_dir, out_name = os.path.split(self.path)
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.pkl.gz")
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.json")

assert os.path.isfile(self.path)
with gzip.open(self.path, "rb") as f:
self.__model = pickle.load(f)
self.__model = XGBClassifier()
self.__model.load_model(self.path)

if get_param_gpu_available(self.config, self.logger):
gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import gzip
import json
import logging
import os
import pickle
import tempfile
import zipfile
from typing import List

from xgboost import XGBClassifier
Expand All @@ -21,7 +22,7 @@ class ScanParagraphClassifierExtractor(object):
def __init__(self, *, config: dict) -> None:
super().__init__()
self.logger = config.get("logger", logging.getLogger())
self.path = os.path.join(get_config()["resources_path"], "paragraph_classifier.pkl.gz")
self.path = os.path.join(get_config()["resources_path"], "paragraph_classifier.zip")
self.config = config
self._feature_extractor = None
self._classifier = None
Expand All @@ -41,11 +42,17 @@ def classifier(self) -> XGBClassifier:
def _unpickle(self) -> None:
if not os.path.isfile(self.path):
out_dir, out_name = os.path.split(self.path)
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="paragraph_classifier", hub_name="model.pkl.gz")
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="paragraph_classifier", hub_name="model.zip")

with gzip.open(self.path) as file:
self._classifier, parameters = pickle.load(file)
self._feature_extractor = ParagraphFeatureExtractor(**parameters, config=self.config)
with tempfile.TemporaryDirectory() as tmpdir:
with zipfile.ZipFile(self.path) as archive:
archive.extractall(tmpdir)

with open(os.path.join(tmpdir, "parameters.json")) as parameters_file:
parameters = json.load(parameters_file)
self._classifier = XGBClassifier()
self._classifier.load_model(os.path.join(tmpdir, "classifier.json"))
self._feature_extractor = ParagraphFeatureExtractor(**parameters, config=self.config)

if get_param_gpu_available(self.config, self.logger):
gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
from dedoc.structure_extractors.line_type_classifiers.law_classifier import LawLineTypeClassifier

path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=self.config)
self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=self.config)
self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.zip"), config=self.config)
self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.zip"), config=self.config)
self.hierarchy_level_builders = [StubHierarchyLevelBuilder()]
self.hl_type = "law"
self.init_hl_depth = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
self.toc_builder = TocBuilder()
self.body_builder = DiplomaBodyBuilder()
path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=self.config)
self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.zip"), config=self.config)
self.footnote_start_regexp = re.compile(r"^\d+ ")

def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
self.body_builder = TzBodyBuilder()
self.toc_builder = TocBuilder()
path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=self.config)
self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=self.config)
self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.zip"), config=self.config)
self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.zip"), config=self.config)

def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import gzip
import json
import logging
import os
import pickle
import tempfile
import zipfile
from abc import ABC
from typing import Optional, Tuple

Expand Down Expand Up @@ -32,10 +33,16 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]:
"""
if not os.path.isfile(path):
out_dir, out_name = os.path.split(path)
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.pkl.gz")
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.zip")

with gzip.open(path) as file:
classifier, feature_extractor_parameters = pickle.load(file)
with tempfile.TemporaryDirectory() as tmpdir:
with zipfile.ZipFile(path) as archive:
archive.extractall(tmpdir)

with open(os.path.join(tmpdir, "parameters.json")) as parameters_file:
feature_extractor_parameters = json.load(parameters_file)
classifier = XGBClassifier()
classifier.load_model(os.path.join(tmpdir, "classifier.json"))

if get_param_gpu_available(self.config, self.logger):
gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)
Expand All @@ -44,19 +51,27 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]:

return classifier, feature_extractor_parameters

def save(self, path_out: str, object_for_saving: object) -> str:
@staticmethod
def save(path_out: str, classifier: XGBClassifier, parameters: dict) -> str:
"""
Save the pickled classifier (with initialization parameters for a feature extractor) into the `.pkl.gz` file with path=`path_out`
Save the classifier (with initialization parameters for a feature extractor) into the `.zip` file with path=`path_out`
* classifier -> classifier.json
* parameters -> parameters.json
:param path_out: path (with file name) where to save the object
:param object_for_saving: classifier with feature extractor's parameters to save
:param classifier: classifier to save
:param parameters: feature extractor parameters to save
:return: the resulting path of the saved file
"""
if path_out.endswith(".pkl"):
path_out += ".gz"
elif not path_out.endswith(".gz"):
path_out += ".pkl.gz"
with tempfile.TemporaryDirectory() as tmpdir:
clf_path = os.path.join(tmpdir, "classifier.json")
params_path = os.path.join(tmpdir, "parameters.json")
classifier.save_model(clf_path)
with open(params_path, "w") as out_file:
json.dump(parameters, out_file)

with gzip.open(path_out, "wb") as file_out:
pickle.dump(obj=object_for_saving, file=file_out)
with zipfile.ZipFile(path_out, "w") as archive:
archive.write(clf_path, os.path.basename(clf_path))
archive.write(params_path, os.path.basename(params_path))
return path_out
Loading

0 comments on commit 8a2678c

Please sign in to comment.