Skip to content

Commit

Permalink
fix: spacy model download with pyinstaller
Browse files Browse the repository at this point in the history
  • Loading branch information
ClemDoum committed Nov 28, 2024
1 parent 3c11ddb commit b713a5a
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ jobs:
-F \
-y \
--name "datashare-spacy-worker$BIN_SUFFIX" \
--hidden-import spacy \
--additional-hooks-dir hooks \
--specpath ./bin \
--add-data ../datashare_spacy_worker/data/models.json:datashare_spacy_worker/data/ \
-p datashare_spacy_worker
Expand Down
1 change: 1 addition & 0 deletions datashare_spacy_worker/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
PYTHON_TASK_GROUP = "PYTHON"
_SPACY_PIPELINE = "SPACY"


@app.task(name="BatchNlp", group=TaskGroup(name=PYTHON_TASK_GROUP))
async def spacy_ner(
docs: list[dict],
Expand Down
47 changes: 41 additions & 6 deletions datashare_spacy_worker/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,25 @@

import json
import logging
import shutil
import urllib.request
import zipfile
from functools import lru_cache
from pathlib import Path
from typing import Any, Generator, Iterable, Iterator
from urllib.parse import urljoin

import spacy
from icij_worker.typing_ import RateProgress
from icij_worker.utils.progress import to_raw_progress
from spacy import Language as SpacyLanguage
from spacy.cli import download
from spacy import Language as SpacyLanguage, about as spacy_about
from spacy.cli.download import (
get_compatibility,
get_model_filename,
get_version,
)
from spacy.tokens import Doc as SpacyDoc, Span
from spacy.util import is_package
from spacy.util import load_model_from_init_py

from datashare_spacy_worker.constants import DATA_DIR
from datashare_spacy_worker.ner_label_scheme import NERLabelScheme
Expand Down Expand Up @@ -111,10 +120,15 @@ def _load_nlp(self, language: str, *, model_size: SpacySize) -> SpacyLanguage:
# TODO: use GPU acceleration using spacy.prefer_gpu() + spacy[cuda]
# TODO: check if we can exclude globally or we must do it language per language
exclude = model.get("exclude", _DEFAULT_EXCLUDE)
if not is_package(model_name):
# We have to do some dark magic because of pyinstaller...
model_path = DATA_DIR / model_name
if not model_path.exists():
logger.info("downloading spacy model %s...", model_name)
download(model_name)
return spacy.load(model_name, exclude=exclude)
# Hack due to the fact pyinstaller doesn't support running a python script
# from the pyinstaller binary. Sadly this is the case when installing a spacy
# model...
_fixed_pysinstaller_download(model_name, model_path)
return load_model_from_init_py(model_path / "__init__.py", exclude=exclude)


def _merge_subdocs(sub_docs: list[SpacyDoc]) -> SpacyDoc:
Expand Down Expand Up @@ -148,3 +162,24 @@ def _spacy_doc_to_ds_tag(
return None
start = ent.start_char
return NlpTag(start=start, mention=ent.text, category=category)


def _fixed_pysinstaller_download(model_name: str, model_path: Path) -> None:
    """Download the wheel of a spaCy model and unpack it next to ``model_path``.

    The wheel matching the installed spaCy version is fetched from spaCy's
    download URL and its content is extracted into ``model_path.parent``, so
    that the model package ends up importable from ``model_path``.

    Args:
        model_name: name of the spaCy model package to download.
        model_path: expected location of the extracted model package; the
            wheel is unpacked into its parent directory.

    Raises:
        urllib.error.URLError: if the wheel can't be downloaded.
        zipfile.BadZipFile: if the downloaded file is not a valid archive.
    """
    # Very ugly, but pyinstaller forces us to do ugly things...
    compatibility = get_compatibility()
    version = get_version(model_name, compatibility)
    filename = get_model_filename(model_name, version)
    # urljoin requires that the base ends with /, or its last path part will
    # be dropped
    base_url = spacy_about.__download_url__
    if not base_url.endswith("/"):
        base_url += "/"
    download_url = urljoin(base_url, filename)
    models_dir = model_path.parent
    # Don't assume the target directory already exists
    models_dir.mkdir(parents=True, exist_ok=True)
    wheel_path = models_dir / Path(filename).name
    try:
        urllib.request.urlretrieve(download_url, wheel_path)
        with zipfile.ZipFile(wheel_path, "r") as wheel:
            # Skip the wheel's packaging metadata rather than extracting it
            # and then wiping every *.dist-info directory found in models_dir
            members = [
                name
                for name in wheel.namelist()
                if not name.split("/", 1)[0].endswith(".dist-info")
            ]
            wheel.extractall(models_dir, members=members)
    finally:
        # The wheel is only a transport artifact: never leave it (or a
        # partially downloaded file) behind
        wheel_path.unlink(missing_ok=True)
65 changes: 65 additions & 0 deletions hooks/hook-spacy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# pylint: disable=invalid-name
"""PyInstaller hook for spacy.

Declares the ``datas`` and ``hiddenimports`` hook globals read by PyInstaller
when bundling spacy into the binary.
"""
from PyInstaller.utils.hooks import collect_data_files

# Data files to ship with the binary, collected package by package. Python
# sources are only included for spacy.lang.
datas = [
    *collect_data_files("spacy", False),
    *collect_data_files("spacy.lang", include_py_files=True),
    *collect_data_files("spacy_lookups_data"),
    *collect_data_files("thinc"),
]

# spacy modules to bundle explicitly (presumably not discovered by
# PyInstaller's import analysis -- TODO confirm)
_spacy_hidden_imports = [
    "spacy.parts_of_speech",
    "spacy.strings",
    "spacy.lexeme",
    "spacy.vocab",
    "spacy.attrs",
    "spacy.kb.candidate",
    "spacy.kb.kb",
    "spacy.kb.kb_in_memory",
    "spacy.ml.parser_model",
    "spacy.morphology",
    "spacy.pipeline.dep_parser",
    "spacy.pipeline._edit_tree_internals.edit_trees",
    "spacy.pipeline.morphologizer",
    "spacy.pipeline.multitask",
    "spacy.pipeline.ner",
    "spacy.pipeline.pipe",
    "spacy.pipeline.trainable_pipe",
    "spacy.pipeline.sentencizer",
    "spacy.pipeline.senter",
    "spacy.pipeline.tagger",
    "spacy.pipeline.transition_parser",
    "spacy.pipeline._parser_internals.arc_eager",
    "spacy.pipeline._parser_internals.ner",
    "spacy.pipeline._parser_internals.nonproj",
    "spacy.pipeline._parser_internals._state",
    "spacy.pipeline._parser_internals.stateclass",
    "spacy.pipeline._parser_internals.transition_system",
    "spacy.pipeline._parser_internals._beam_utils",
    "spacy.tokenizer",
    "spacy.training.align",
    "spacy.training.gold_io",
    "spacy.tokens.doc",
    "spacy.tokens.span",
    "spacy.tokens.token",
    "spacy.tokens.span_group",
    "spacy.tokens.graph",
    "spacy.tokens.morphanalysis",
    "spacy.tokens._retokenize",
    "spacy.matcher.matcher",
    "spacy.matcher.phrasematcher",
    "spacy.matcher.dependencymatcher",
    "spacy.symbols",
    "spacy.vectors",
]

# Modules from the libraries spacy relies on
_dependency_hidden_imports = [
    "blis",
    "blis.py",
    "cymem",
    "cymem.cymem",
    "murmurhash",
    "preshed.maps",
    "srsly.msgpack.util",
    "thinc.extra.search",
    "thinc.linalg",
    "thinc.neural._aligned_alloc",
    "thinc.neural._custom_kernels",
]

hiddenimports = _spacy_hidden_imports + _dependency_hidden_imports

0 comments on commit b713a5a

Please sign in to comment.