Skip to content

Commit

Permalink
Add HTTP downloader for tatoeba
Browse files Browse the repository at this point in the history
  • Loading branch information
eumiro committed Jan 17, 2021
1 parent 281e7c1 commit 55b11b0
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 3 deletions.
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"Topic :: Education",
"Topic :: Education :: Computer Aided Instruction (CAI)",
]
INSTALL_REQUIRES = ["xdg"]
INSTALL_REQUIRES = ["python-dateutil", "requests", "xdg"]

# --+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----

Expand Down Expand Up @@ -94,7 +94,7 @@
zip_safe=False,
classifiers=CLASSIFIERS,
install_requires=INSTALL_REQUIRES,
extras_require={"test": ["pytest"]},
extras_require={"test": ["pytest", "coverage", "requests-mock"]},
options={},
include_package_data=True,
entry_points={
Expand Down
43 changes: 43 additions & 0 deletions src/lumipallo/corpus.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""Corpus containing all sentences and links to translations."""

import datetime as dt
import typing
from pathlib import Path

import requests
import xdg
from dateutil import parser


class Corpus:
Expand All @@ -14,3 +17,43 @@ def __init__(self, data_dir: typing.Union[Path, None] = None):
data_dir = xdg.xdg_data_home()
self.path = data_dir / "lumipallo"
self.path.mkdir(parents=True, exist_ok=True)

def _sync_tatoeba(self, local_path: Path, url: str) -> bool:
    """Download ``url`` to ``local_path`` unless a refresh is pointless.

    Nothing is downloaded when the local file was modified within the
    last 7 days, or when the server's ``Last-Modified`` header says the
    remote file itself is older than 7 days. If the server sends no
    ``Last-Modified`` header the remote age is unknown and the file is
    downloaded.

    Args:
        local_path: Destination file for the downloaded export.
        url: Location of the remote export.

    Returns:
        True if the file was downloaded and written, False if skipped.

    Raises:
        requests.HTTPError: If the HEAD or GET request answers with an
            error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # requests waits indefinitely unless a timeout is given (seconds).
    timeout = 30
    print(f"Checking {local_path} ")
    now = dt.datetime.now(dt.timezone.utc)
    too_old = now - dt.timedelta(days=7)
    if local_path.exists():
        mtime = dt.datetime.fromtimestamp(
            local_path.stat().st_mtime, dt.timezone.utc
        )
        if too_old < mtime:
            return False

    # requests.head() does not follow redirects unless asked to.
    res = requests.head(url, allow_redirects=True, timeout=timeout)
    res.raise_for_status()
    last_modified = res.headers.get("Last-Modified")
    if last_modified is not None:
        remote_mtime = parser.parse(last_modified)
        if remote_mtime.tzinfo is None:
            # HTTP dates are GMT; keep the comparison below tz-aware.
            remote_mtime = remote_mtime.replace(tzinfo=dt.timezone.utc)
        if remote_mtime < too_old:
            return False

    print(f"Downloading {url} ...", end="", flush=True)
    # Stream into a sibling temp file and rename at the end, so that an
    # interrupted download never leaves a truncated file whose fresh
    # mtime would suppress the next sync.
    tmp_path = local_path.with_name(local_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as res:
            res.raise_for_status()
            with tmp_path.open("wb") as fh:
                for chunk in res.iter_content(chunk_size=64 * 1024):
                    fh.write(chunk)
        tmp_path.replace(local_path)
    finally:
        # Only still present when the download failed part-way.
        if tmp_path.exists():
            tmp_path.unlink()
    print("OK")
    return True

def update_tatoeba(
    self,
    src_langs: typing.List[str],
    target_langs: typing.List[str],
    local_root: Path,
) -> None:  # pragma: no cover
    """Sync the Tatoeba per-language exports needed for the given languages.

    For every source and target language the sentences file is synced,
    and for every (target, source) pair the links file is synced. Each
    file is handed to ``_sync_tatoeba``.

    Args:
        src_langs: Language codes of the source languages.
        target_langs: Language codes of the target languages.
        local_root: Directory the downloaded files are stored in.
    """
    # No trailing slash here: the f-strings below insert the separators,
    # otherwise every URL would contain "per_language//".
    url_root = "https://downloads.tatoeba.org/exports/per_language"

    for lang in [*src_langs, *target_langs]:
        url_sentences = f"{url_root}/{lang}/{lang}_sentences.tsv.bz2"
        local_path = local_root / f"{lang}_sentences.tsv.bz2"
        self._sync_tatoeba(local_path, url_sentences)

    for src in src_langs:
        for target in target_langs:
            url_links = f"{url_root}/{target}/{target}-{src}_links.tsv.bz2"
            local_path = local_root / f"{target}-{src}_links.tsv.bz2"
            self._sync_tatoeba(local_path, url_links)
65 changes: 65 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""Test corpus.py"""
import datetime as dt

from lumipallo.corpus import Corpus


Expand All @@ -14,3 +16,66 @@ def test_corpus_empty(tmp_path):
corpus = Corpus(data_dir=tmp_path)
assert corpus.path.is_dir()
assert not list(corpus.path.iterdir())


def test_download_tatoeba_sentences_fresh_ok(requests_mock, tmp_path):
    """A recently modified remote file is fetched and stored locally."""
    payload = b"hello world"
    url = (
        "https://downloads.tatoeba.org"
        "/exports/per_language/deu/deu_sentences.tsv.bz2"
    )
    # The server reports a file that changed yesterday.
    yesterday = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=1)
    requests_mock.head(url, headers={"Last-Modified": yesterday.isoformat()})
    requests_mock.get(url, content=payload)

    corpus = Corpus(data_dir=tmp_path)
    target = corpus.path / "deu_sentences.tsv.bz2"
    downloaded = corpus._sync_tatoeba(target, url)

    # One HEAD to check the age plus one GET for the payload.
    assert requests_mock.call_count == 2
    assert downloaded
    assert target.is_file()
    assert target.read_bytes() == payload


def test_download_tatoeba_sentences_not_needed(requests_mock, tmp_path):
    """A fresh local copy prevents any request to tatoeba."""
    payload = b"hello world"
    url = (
        "https://downloads.tatoeba.org"
        "/exports/per_language/deu/deu_sentences.tsv.bz2"
    )
    yesterday = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=1)
    requests_mock.head(url, headers={"Last-Modified": yesterday.isoformat()})
    requests_mock.get(url, content=payload)

    corpus = Corpus(data_dir=tmp_path)
    target = corpus.path / "deu_sentences.tsv.bz2"
    # Writing the file now gives it a current modification time.
    target.write_bytes(payload)
    downloaded = corpus._sync_tatoeba(target, url)

    assert requests_mock.call_count == 0
    assert not downloaded
    assert target.is_file()
    assert target.read_bytes() == payload


def test_download_tatoeba_sentences_tooold(requests_mock, tmp_path):
    """A remote file older than the cutoff is not downloaded."""
    payload = b"hello world"
    url = (
        "https://downloads.tatoeba.org"
        "/exports/per_language/deu/deu_sentences.tsv.bz2"
    )
    # The server reports a file last changed ten days ago.
    stale = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=10)
    requests_mock.head(url, headers={"Last-Modified": stale.isoformat()})
    requests_mock.get(url, content=payload)

    corpus = Corpus(data_dir=tmp_path)
    target = corpus.path / "deu_sentences.tsv.bz2"
    downloaded = corpus._sync_tatoeba(target, url)

    # Only the HEAD request is made; the GET is skipped.
    assert requests_mock.call_count == 1
    assert not downloaded
    assert not target.is_file()
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ max-line-length = 79
ignore = E231 # clashes with black

[isort]
known_third_party = pytest,setuptools
known_third_party = requests,xdg,pytest,setuptools

[gh-actions]
python =
Expand Down

0 comments on commit 55b11b0

Please sign in to comment.