Skip to content

Commit

Permalink
feat: Update base image to python:3.9
Browse files Browse the repository at this point in the history
fix failing test (random ordering of dates)

fix and update tesserocr installation
  • Loading branch information
stchris committed Nov 21, 2024
1 parent 43d32e0 commit 09ee9cb
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 134 deletions.
256 changes: 127 additions & 129 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,162 +1,160 @@
FROM ubuntu:20.04
FROM python:3.9
ENV DEBIAN_FRONTEND noninteractive

LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
LABEL org.opencontainers.image.licenses MIT
LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file

# Enable non-free archive for `unrar`.
# RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list
RUN apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales ca-certificates \
# git
git \
# python deps (mostly to install their dependencies)
python3-pip python3-dev python3-pil \
# tesseract
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\
# libraries
libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
zlib1g-dev libicu-dev libxml2-dev \
# package tools
unrar p7zip-full \
# audio & video metadata
libmediainfo-dev \
# image processing, djvu
imagemagick-common imagemagick mdbtools djvulibre-bin \
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils \
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
tesseract-ocr-swe \
# tesseract-ocr-tam \
# tesseract-ocr-tel \
tesseract-ocr-fil \
# tesseract-ocr-tha \
tesseract-ocr-tur \
tesseract-ocr-ukr \
# tesseract-ocr-vie \
tesseract-ocr-nld \
tesseract-ocr-nor \
tesseract-ocr-pol \
tesseract-ocr-por \
tesseract-ocr-ron \
tesseract-ocr-rus \
tesseract-ocr-slk \
tesseract-ocr-slv \
tesseract-ocr-spa \
# tesseract-ocr-spa_old \
tesseract-ocr-sqi \
tesseract-ocr-srp \
tesseract-ocr-ind \
tesseract-ocr-isl \
tesseract-ocr-ita \
# tesseract-ocr-ita_old \
# tesseract-ocr-jpn \
tesseract-ocr-kan \
tesseract-ocr-kat \
# tesseract-ocr-kor \
tesseract-ocr-khm \
tesseract-ocr-lav \
tesseract-ocr-lit \
# tesseract-ocr-mal \
tesseract-ocr-mkd \
tesseract-ocr-mya \
tesseract-ocr-mlt \
tesseract-ocr-msa \
tesseract-ocr-est \
# tesseract-ocr-eus \
tesseract-ocr-fin \
tesseract-ocr-fra \
tesseract-ocr-frk \
# tesseract-ocr-frm \
# tesseract-ocr-glg \
# tesseract-ocr-grc \
tesseract-ocr-heb \
tesseract-ocr-hin \
tesseract-ocr-hrv \
tesseract-ocr-hye \
tesseract-ocr-hun \
# tesseract-ocr-ben \
tesseract-ocr-bul \
tesseract-ocr-cat \
tesseract-ocr-ces \
tesseract-ocr-nep \
# tesseract-ocr-chi_sim \
# tesseract-ocr-chi_tra \
# tesseract-ocr-chr \
tesseract-ocr-dan \
tesseract-ocr-deu \
tesseract-ocr-ell \
# tesseract-ocr-enm \
# tesseract-ocr-epo \
# tesseract-ocr-equ \
tesseract-ocr-afr \
tesseract-ocr-ara \
tesseract-ocr-aze \
tesseract-ocr-bel \
tesseract-ocr-uzb \
### pdf convert: libreoffice + a bunch of fonts
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
fonts-tlwg-purisa \
###
&& apt-get -qq -y autoremove \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
&& apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales ca-certificates \
# git
git \
# Python packages (installed mostly to pull in their dependencies)
python3-dev \
# tesseract
tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\
# libraries
libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
zlib1g-dev libicu-dev libxml2-dev \
# archive tools (rar, 7z)
unrar p7zip-full \
# audio & video metadata
libmediainfo-dev \
# image processing, djvu
imagemagick-common imagemagick mdbtools djvulibre-bin \
libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils libopenjp2-7-dev libgif-dev libpng-dev \
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
tesseract-ocr-swe \
# tesseract-ocr-tam \
# tesseract-ocr-tel \
tesseract-ocr-fil \
# tesseract-ocr-tha \
tesseract-ocr-tur \
tesseract-ocr-ukr \
# tesseract-ocr-vie \
tesseract-ocr-nld \
tesseract-ocr-nor \
tesseract-ocr-pol \
tesseract-ocr-por \
tesseract-ocr-ron \
tesseract-ocr-rus \
tesseract-ocr-slk \
tesseract-ocr-slv \
tesseract-ocr-spa \
# tesseract-ocr-spa_old \
tesseract-ocr-sqi \
tesseract-ocr-srp \
tesseract-ocr-ind \
tesseract-ocr-isl \
tesseract-ocr-ita \
# tesseract-ocr-ita_old \
# tesseract-ocr-jpn \
tesseract-ocr-kan \
tesseract-ocr-kat \
# tesseract-ocr-kor \
tesseract-ocr-khm \
tesseract-ocr-lav \
tesseract-ocr-lit \
# tesseract-ocr-mal \
tesseract-ocr-mkd \
tesseract-ocr-mya \
tesseract-ocr-mlt \
tesseract-ocr-msa \
tesseract-ocr-est \
# tesseract-ocr-eus \
tesseract-ocr-fin \
tesseract-ocr-fra \
tesseract-ocr-frk \
# tesseract-ocr-frm \
# tesseract-ocr-glg \
# tesseract-ocr-grc \
tesseract-ocr-heb \
tesseract-ocr-hin \
tesseract-ocr-hrv \
tesseract-ocr-hye \
tesseract-ocr-hun \
# tesseract-ocr-ben \
tesseract-ocr-bul \
tesseract-ocr-cat \
tesseract-ocr-ces \
tesseract-ocr-nep \
# tesseract-ocr-chi_sim \
# tesseract-ocr-chi_tra \
# tesseract-ocr-chr \
tesseract-ocr-dan \
tesseract-ocr-deu \
tesseract-ocr-ell \
# tesseract-ocr-enm \
# tesseract-ocr-epo \
# tesseract-ocr-equ \
tesseract-ocr-afr \
tesseract-ocr-ara \
tesseract-ocr-aze \
tesseract-ocr-bel \
tesseract-ocr-uzb \
### pdf convert: libreoffice + a bunch of fonts
libreoffice fonts-opensymbol hyphen-fr hyphen-de \
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
fonts-tlwg-purisa \
###
&& apt-get -qq -y autoremove \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

# Set up the locale and make sure the system uses Unicode (UTF-8) for the file system.
ENV LANG='en_US.UTF-8' \
TZ='UTC' \
OMP_THREAD_LIMIT='1' \
OPENBLAS_NUM_THREADS='1'
TZ='UTC' \
OMP_THREAD_LIMIT='1' \
OPENBLAS_NUM_THREADS='1'

RUN groupadd -g 1000 -r app \
&& useradd -m -u 1000 -s /bin/false -g app app
&& useradd -m -u 1000 -s /bin/false -g app app

# Download the ftm-typepredict model
RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt

# Install spaCy models
RUN python3 -m spacy download en_core_web_sm \
&& python3 -m spacy download de_core_news_sm \
&& python3 -m spacy download fr_core_news_sm \
&& python3 -m spacy download es_core_news_sm
&& python3 -m spacy download de_core_news_sm \
&& python3 -m spacy download fr_core_news_sm \
&& python3 -m spacy download es_core_news_sm
RUN python3 -m spacy download ru_core_news_sm \
&& python3 -m spacy download pt_core_news_sm \
&& python3 -m spacy download ro_core_news_sm \
&& python3 -m spacy download mk_core_news_sm
&& python3 -m spacy download pt_core_news_sm \
&& python3 -m spacy download ro_core_news_sm \
&& python3 -m spacy download mk_core_news_sm
RUN python3 -m spacy download el_core_news_sm \
&& python3 -m spacy download pl_core_news_sm \
&& python3 -m spacy download it_core_news_sm \
&& python3 -m spacy download lt_core_news_sm \
&& python3 -m spacy download nl_core_news_sm \
&& python3 -m spacy download nb_core_news_sm \
&& python3 -m spacy download da_core_news_sm
&& python3 -m spacy download pl_core_news_sm \
&& python3 -m spacy download it_core_news_sm \
&& python3 -m spacy download lt_core_news_sm \
&& python3 -m spacy download nl_core_news_sm \
&& python3 -m spacy download nb_core_news_sm \
&& python3 -m spacy download da_core_news_sm
# RUN python3 -m spacy download zh_core_web_sm

COPY . /ingestors
WORKDIR /ingestors
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN chown -R app:app /ingestors

ENV ARCHIVE_TYPE=file \
ARCHIVE_PATH=/data \
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
REDIS_URL=redis://redis:6379/0 \
TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata \
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
ARCHIVE_PATH=/data \
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
REDIS_URL=redis://redis:6379/0 \
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1

# USER app
CMD ingestors process
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ format-check:
black --check .

test: services
$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
PYTHONDEVMODE=1 PYTHONTRACEMALLOC=1 $(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term

restart: build
$(COMPOSE) up --force-recreate --no-deps --detach ingest-file
Expand Down
2 changes: 0 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
version: "3.2"

services:
postgres:
image: postgres:10.0
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ languagecodes==1.1.1
countrytagger==0.1.2
pyicu==2.12
google-cloud-vision==3.7.2
tesserocr==2.6.2
tesserocr==2.7.1
spacy==3.6.1
numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy
fingerprints==1.1.1
fasttext==0.9.2
pika==1.3.2
Expand Down
2 changes: 1 addition & 1 deletion tests/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_audio(self):
self.assertEqual(entity.first("generator"), "com.apple.VoiceMemos (iOS 11.4)")
self.assertEqual(
entity.first("authoredAt"),
datetime.datetime(2018, 6, 20, 12, 9, 42).isoformat(),
datetime.datetime(2018, 6, 20, 12, 9, 28).isoformat(),
)
self.assertEqual(entity.first("duration"), "2808")
self.assertEqual(entity.first("samplingRate"), "44100")
Expand Down

0 comments on commit 09ee9cb

Please sign in to comment.