forked from kermitt2/datastet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dockerfile.datastet
100 lines (76 loc) · 3.72 KB
/
Dockerfile.datastet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# -------------------
# build builder image
# -------------------
FROM openjdk:17-jdk-slim AS builder

USER root

# Build-time tools (unzip and wget are used by later steps of this stage).
# The apt package lists are removed in the same layer so they never end up
# stored in the image.
RUN apt-get update && \
    apt-get -y --no-install-recommends install \
        apt-utils \
        git \
        libxml2 \
        unzip \
        wget && \
    rm -rf /var/lib/apt/lists/*
# Lay out the build tree under /opt/grobid: the project goes into
# datastet-source, and grobid-home/models is created up front.
WORKDIR /opt/grobid
RUN mkdir -p datastet-source grobid-home/models

# Gradle build definition and wrapper script
COPY build.gradle gradle.properties gradlew settings.gradle datastet-source/
COPY gradle datastet-source/gradle/

# Resources; the Docker-specific configuration is installed as config.yml
COPY resources/config/config-docker.yml datastet-source/resources/config/config.yml
COPY resources/lexicon datastet-source/resources/lexicon
COPY resources/models datastet-source/resources/models

# Application sources
COPY src datastet-source/src

#COPY .git datastet-source/.git
#COPY localLibs datastet-source/localLibs
# Build the project and prepare the models
WORKDIR /opt/grobid/datastet-source

# Make sure the models directory is empty before the build runs
RUN rm -rf /opt/grobid/grobid-home/models/*

# Assemble the project; the shadowJar task is excluded
RUN ./gradlew clean assemble \
        -x shadowJar \
        --no-daemon \
        --stacktrace \
        --info

# Run the installModels task, then delete any .zip files left in the models
# directory (presumably the downloaded model archives - TODO confirm)
RUN ./gradlew installModels \
        --no-daemon \
        --info \
        --stacktrace \
    && rm -f /opt/grobid/grobid-home/models/*.zip
# Unpack the distribution archive from the Gradle build output into
# /opt/grobid/datastet, then delete the build output directory.
# All relative paths below resolve against this WORKDIR.
WORKDIR /opt/grobid
RUN unzip -o datastet-source/build/distributions/datastet-*.zip \
        -d datastet_distribution \
    && mv datastet_distribution/datastet-* datastet \
    && rm -rf datastet-source/build
# install Pub2TEI
# PUB2TEI_REF selects which git ref of kermitt2/Pub2TEI is downloaded. The
# default resolves to the same URL as before (tip of master). For a
# reproducible build, pin it, e.g.
#   docker build --build-arg PUB2TEI_REF=<commit sha> ...
#   docker build --build-arg PUB2TEI_REF=refs/tags/<tag> ...
WORKDIR /opt/
ARG PUB2TEI_REF=refs/heads/master
# The archive unpacks into a single Pub2TEI-<ref> directory, whose name depends
# on the ref; the glob renames it to a stable /opt/Pub2TEI whatever ref is used.
RUN wget -O pub2tei.zip "https://github.com/kermitt2/Pub2TEI/archive/${PUB2TEI_REF}.zip" && \
    unzip pub2tei.zip && \
    mv Pub2TEI-* Pub2TEI && \
    rm pub2tei.zip
# -------------------
# build runtime image
# -------------------
FROM lfoppiano/grobid:0.8.1-full AS runtime

# setting locale is likely useless but to be sure
ENV LANG=C.UTF-8

WORKDIR /opt/grobid

# Remove the *-with_ELMo model directories and the grobid-service directory
# that the base image provides at these paths.
# The `resources` symlink is just a hack to make the lexicon loader work: it
# makes /opt/grobid/resources point at datastet/resources.
RUN rm -rf /opt/grobid/grobid-home/models/*-with_ELMo \
    && rm -rf /opt/grobid/grobid-service \
    && ln -sf datastet/resources/ resources
# Bring in the artefacts produced by the builder stage. Relative destinations
# resolve against the current WORKDIR (/opt/grobid).
COPY --from=builder /opt/grobid/grobid-home/models ./grobid-home/models

# The DataStet distribution is copied exactly once. It was previously copied a
# second time after the two overrides below; that added a redundant layer and
# could overwrite the overridden files with the copies from the distribution.
COPY --from=builder /opt/grobid/datastet ./datastet/

# Configuration and lexicon taken from the source tree, laid over the distribution
COPY --from=builder /opt/grobid/datastet-source/resources/config/config.yml ./datastet/resources/config/
COPY --from=builder /opt/grobid/datastet-source/resources/lexicon/ ./datastet/resources/lexicon/

COPY --from=builder /opt/Pub2TEI /opt/Pub2TEI

VOLUME ["/opt/grobid/grobid-home/tmp"]
#WORKDIR /opt/grobid
# install ELMo
#RUN wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json
#RUN wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5
#RUN mkdir /opt/elmo
#RUN mv elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json /opt/elmo/
#RUN mv elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5 /opt/elmo/
# The command below would build the embeddings and bake them permanently into the image (only if required by the config).
# LF: AFAIK this is not needed at the moment, as all the models run with BERT, but it might
# be a solution if we want to support the GRU version.
# RUN python3 preload_embeddings.py --registry ./resources-registry.json --embedding word2vec
# Version recorded in the image; falls back to "latest" when the
# GROBID_VERSION build argument is not supplied.
ARG GROBID_VERSION
ENV GROBID_VERSION=${GROBID_VERSION:-latest}

# JVM options: native library search path and --add-opens for java.base/java.lang.
# Presumably read by the datastet launcher script - TODO confirm.
ENV DATASTET_OPTS="-Djava.library.path=/opt/grobid/grobid-home/lib/lin-64:/usr/local/lib/python3.8/dist-packages/jep --add-opens java.base/java.lang=ALL-UNNAMED"

# Start the DataStet server with the bundled configuration
# (both paths are relative to the working directory).
CMD ["./datastet/bin/datastet", "server", "datastet/resources/config/config.yml"]
# Image metadata; the version label is filled from GROBID_VERSION.
LABEL authors="The contributors" \
      org.label-schema.description="Image with DataStet service" \
      org.label-schema.name="datastet" \
      org.label-schema.url="https://github.com/DataSeer/datastet" \
      org.label-schema.version="${GROBID_VERSION}"