#!/bin/bash
# Scans the contents of the release folder and copies the shared task files into folders that should appear in TIRA.
UDPATH=/net/work/people/zeman/unidep
SRCREL=$UDPATH/release-2.2/ud-treebanks-v2.2
# SRCMOR: UDPipe-predicted morphology on top of gold segmentation and syntax. This has been released in Lindat.
# SRCPSM: UDPipe-predicted segmentation and morphology, no syntax. Preprocessed input for participants who do not want to do this on their own.
# This folder contains a pipeline that Milan created for this purpose. The script process-tira-devel-trial.sh runs it on the devel and trial data.
# That script takes its input from my $DST folder (see below), so it must be re-run whenever we change the data here. Right now
# its results are already there, in the folders development and trial, in files named code-udpipe.conllu.
SRCMOR=/net/work/people/zeman/unidep/ud-2.2-conll18-crossfold-morphology
SRCPSM=/net/work/people/zeman/unidep/tira-data-2018-psegmorfile
###!!! Skip files preprocessed by UDPipe until Milan prepares the files from UD 2.2.
###!!! Setting the variable SKIPUDPIPE to any non-empty value will do it.
###!!! Comment it out when ready.
#SKIPUDPIPE=yes
DSTFOLDER=data-for-tira
DST=$UDPATH/$DSTFOLDER
# Expected folder structure at TIRA: Datasets are mounted under /media/*.
# The top-level subfolders determine where, when and how the datasets are
# accessible:
# - training-datasets
# - training-datasets-truth
# - test-datasets
# - test-datasets-truth
# Each of these folders has subfolders for all shared tasks hosted at TIRA.
# Ours is called "universal-dependency-learning"; its subfolders are the
# datasets of the individual years/rounds. We currently have CoNLL 2017 and 2018.
# The subfolders of the 2017 task are:
# - conll17-ud-trial-2017-03-19 (in training-datasets, seemingly identical in training-datasets-truth; for ar, en, tr, vi, zh)
# - conll17-ud-development-2017-03-19 (in training-datasets, seemingly identical in training-datasets-truth)
# - conll17-ud-test-2017-05-09 (in test-datasets and test-datasets-truth)
# So we should now create for 2018:
# - conll18-ud-trial-2018-04-15 (in training-datasets)
# - conll18-ud-development-2018-04-15
# - conll18-ud-test-2018-04-15 (in test-datasets and test-datasets-truth)
# /media/training-datasets: Only this folder can be accessed by the participants.
# It contains the training data (just in case they want to train something
# here?), the development data (participants can access the development data
# directly from their virtual machine, and they can run their system through
# the web interface with the development data as input), and trial data
# (a small subset of the development data, used for fast debugging).
# These datasets may include the gold standard so that the participants can
# access it when they experiment in their virtual machine; but a copy of the
# gold standard must also exist in the "truth" folder mentioned below.
# /media/training-datasets-truth: Not accessible by the participants, although
# the other tasks seem to keep just a copy of the training and development
# data here. This is probably the place from which the evaluation script
# takes the gold standard.
# /media/test-datasets: Blind test data used as input for the systems. The
# participants normally do not see it; their system can access it only when
# the virtual machine is "sandboxed", to prevent information leaks.
# Of course this folder must not contain gold-standard data.
# /media/test-datasets-truth: Gold standard test data used by the evaluation
# script.
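# Concretely, with the TASK and DATE values defined below, the tree generated
# under $DST will contain folders such as
#   data-for-tira/training-datasets/universal-dependency-learning/conll18-ud-development-2018-05-06
#   data-for-tira/test-datasets-truth/universal-dependency-learning/conll18-ud-test-2018-05-06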
TASK="universal-dependency-learning/conll18-ud"
DATE="2018-05-06"
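# Naming convention: the I suffix marks input folders (visible to the systems),
# the G suffix marks the corresponding gold-standard (truth) folders.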
DSTTRAINI="$DST/training-datasets/$TASK-training-$DATE"
DSTTRAING="$DST/training-datasets-truth/$TASK-training-$DATE"
DSTDEVI="$DST/training-datasets/$TASK-development-$DATE"
DSTDEVG="$DST/training-datasets-truth/$TASK-development-$DATE"
DSTTRIALI="$DST/training-datasets/$TASK-trial-$DATE"
DSTTRIALG="$DST/training-datasets-truth/$TASK-trial-$DATE"
DSTTESTI="$DST/test-datasets/$TASK-test-$DATE"
DSTTESTG="$DST/test-datasets-truth/$TASK-test-$DATE"
# Generate the folders.
rm -rf $DST
mkdir -p $DSTTRAINI
mkdir -p $DSTTRAING
mkdir -p $DSTDEVI
mkdir -p $DSTDEVG
mkdir -p $DSTTRIALI
mkdir -p $DSTTRIALG
mkdir -p $DSTTESTI
mkdir -p $DSTTESTG
echo code.conllu .......... gold standard data > $DSTTRAINI/README.txt
echo code-pmor.conllu ..... gold segmentation and syntax, predicted morphology >> $DSTTRAINI/README.txt
echo code-udpipe.conllu ... predicted segmentation and morphology, no syntax >> $DSTTRAINI/README.txt
echo code.txt ............. raw text input >> $DSTTRAINI/README.txt
cp $DSTTRAINI/README.txt $DSTDEVI/README.txt
echo Small subset of the development data, intended for debugging. > $DSTTRIALI/README.txt
echo code-udpipe.conllu ... predicted segmentation and morphology, no syntax >> $DSTTRIALI/README.txt
echo code.txt ............. raw text input >> $DSTTRIALI/README.txt
echo code-udpipe.conllu ... predicted segmentation and morphology, no syntax > $DSTTESTI/README.txt
echo code.txt ............. raw text input >> $DSTTESTI/README.txt
# In the folders that the system can get as input, we will create metadata.json.
# The system should use it to identify what files it is supposed to process and
# how. Metadata fields:
cat << EOF > $DST/README-metadata.txt
The system must read metadata.json to obtain the list of test sets it is
supposed to process. Each test set has a record in the list, and the fields in
the record are interpreted as follows:
* lcode ..... language code (for UD languages same as in UD; but other
languages may appear here too)
* tcode ..... treebank code (for UD treebanks same as in UD; but extra non-UD
treebanks may appear here too)
* rawfile ... name of raw text file (input of systems that do their own
segmentation)
* psegmorfile ... name of CoNLL-U file with segmentation and morphology
predicted by a baseline system (currently UDPipe)
* outfile ... name of the corresponding CoNLL-U file that the system must
generate in the output folder
Extra fields not needed by the participating system:
* goldfile ... name of the corresponding gold-standard file to be used by the
evaluation script (in a separate folder)
EOF
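# For illustration, one record of metadata.json, as generated by the loop
# below, looks like this (here for the Czech-PDT treebank):
#   {"lcode":"cs", "tcode":"pdt", "rawfile":"cs_pdt.txt", "psegmorfile":"cs_pdt-udpipe.conllu", "outfile":"cs_pdt.conllu", "goldfile":"cs_pdt.conllu"}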
cat $DST/README-metadata.txt >> $DSTDEVI/README.txt
cat $DST/README-metadata.txt >> $DSTTRIALI/README.txt
cat $DST/README-metadata.txt >> $DSTTESTI/README.txt
rm $DST/README-metadata.txt
# Copy the data to the folders.
cd $SRCREL || exit 1
echo '[' > $DSTDEVI/metadata.json
echo '[' > $DSTTRIALI/metadata.json
echo '[' > $DSTTESTI/metadata.json
for i in UD_* ; do
    # All treebanks should have test data even if they do not have training or
    # development data.
    ltcode=$(ls $i | grep -- '-ud-test\.conllu$' | perl -pe 's/-ud-test\.conllu$//')
    lcode=$(echo $ltcode | perl -pe 's/_.*//')
    tcode=$(echo $ltcode | perl -pe 'if(m/_(.+)/) {$_=$1} else {$_=0}')
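    # Example: UD_Czech-PDT contains cs_pdt-ud-test.conllu, so ltcode=cs_pdt,
    # lcode=cs and tcode=pdt; a code without an underscore would yield tcode=0.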
    echo $(pwd)/$i '-->' $ltcode
    # Some small treebanks do not have any training set.
    if [ -f $i/$ltcode-ud-train.conllu ] ; then
        chmod 644 $i/$ltcode-ud-train.conllu
        cp $i/$ltcode-ud-train.conllu $DSTTRAING/$ltcode.conllu
        cp $i/$ltcode-ud-train.conllu $DSTTRAINI/$ltcode.conllu
        if [ -z "$SKIPUDPIPE" ] ; then
            cp $SRCMOR/$i/$ltcode-ud-train.conllu $DSTTRAINI/$ltcode-pmor.conllu
        fi
        cp $i/$ltcode-ud-train.txt $DSTTRAINI/$ltcode.txt
    fi
    # Some small treebanks do not have any dev set.
    if [ -f $i/$ltcode-ud-dev.conllu ] ; then
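        # Write a comma separator before every record except the first one,
        # so that metadata.json remains a valid JSON list.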
        if [ -z "$firstdev" ] ; then
            firstdev="nolonger"
        else
            echo , >> $DSTDEVI/metadata.json
        fi
        echo -n ' {"lcode":"'$lcode'", "tcode":"'$tcode'", "rawfile":"'$ltcode'.txt", "psegmorfile":"'$ltcode'-udpipe.conllu", "outfile":"'$ltcode'.conllu", "goldfile":"'$ltcode'.conllu"}' >> $DSTDEVI/metadata.json
        chmod 644 $i/$ltcode-ud-dev.conllu
        cp $i/$ltcode-ud-dev.conllu $DSTDEVG/$ltcode.conllu
        cp $i/$ltcode-ud-dev.conllu $DSTDEVI/$ltcode.conllu
        if [ -z "$SKIPUDPIPE" ] ; then
            cp $SRCMOR/$i/$ltcode-ud-dev.conllu $DSTDEVI/$ltcode-pmor.conllu
            cp $SRCPSM/training-datasets/universal-dependency-learning/conll18-ud-development-2018-04-15/$ltcode-udpipe.conllu $DSTDEVI/$ltcode-udpipe.conllu
        fi
        cp $i/$ltcode-ud-dev.txt $DSTDEVI/$ltcode.txt
    fi
    # Create a trial dataset for debugging purposes.
    # Unlike the development data, we will not provide the gold-standard file in the input folder.
    # The purpose is to make the setting as similar to the test data as possible.
    if [ "$i" = "UD_English-GUM" ] || [ "$i" = "UD_Czech-PDT" ] || [ "$i" = "UD_Arabic-PADT" ] || [ "$i" = "UD_Chinese-GSD" ] || [ "$i" = "UD_Swedish-Talbanken" ] ; then
        if [ -z "$firsttrial" ] ; then
            firsttrial="nolonger"
        else
            echo , >> $DSTTRIALI/metadata.json
        fi
        echo -n ' {"lcode":"'$lcode'", "tcode":"'$tcode'", "rawfile":"'$ltcode'.txt", "psegmorfile":"'$ltcode'-udpipe.conllu", "outfile":"'$ltcode'.conllu", "goldfile":"'$ltcode'.conllu"}' >> $DSTTRIALI/metadata.json
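        # Keep the first 50 sentences of the dev data as the trial gold standard
        # (the rest goes to /dev/null) and regenerate the raw text from them.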
        split_conll.pl -head 50 < $i/$ltcode-ud-dev.conllu $DSTTRIALG/$ltcode.conllu /dev/null
        ../../tools/conllu_to_text.pl --lang $lcode < $DSTTRIALG/$ltcode.conllu > $DSTTRIALI/$ltcode.txt
        if [ -z "$SKIPUDPIPE" ] ; then
            cp $SRCPSM/training-datasets/universal-dependency-learning/conll18-ud-trial-2018-04-15/$ltcode-udpipe.conllu $DSTTRIALI/$ltcode-udpipe.conllu
        fi
    fi
    # Copy the test data. All treebanks should have test data even if they do
    # not have training or development data.
    if [ -z "$firsttest" ] ; then
        firsttest="nolonger"
    else
        echo , >> $DSTTESTI/metadata.json
    fi
    echo -n ' {"lcode":"'$lcode'", "tcode":"'$tcode'", "rawfile":"'$ltcode'.txt", "psegmorfile":"'$ltcode'-udpipe.conllu", "outfile":"'$ltcode'.conllu", "goldfile":"'$ltcode'.conllu"}' >> $DSTTESTI/metadata.json
    chmod 644 $i/$ltcode-ud-test.conllu
    cp $i/$ltcode-ud-test.conllu $DSTTESTG/$ltcode.conllu
    cp $i/$ltcode-ud-test.txt $DSTTESTI/$ltcode.txt
    if [ -z "$SKIPUDPIPE" ] ; then
        cp $SRCPSM/test-datasets/universal-dependency-learning/conll18-ud-test-2018-04-15/$ltcode-udpipe.conllu $DSTTESTI/$ltcode-udpipe.conllu
    fi
done
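# Terminate the last record of each list with a newline and close the JSON lists.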
echo >> $DSTDEVI/metadata.json
echo >> $DSTTRIALI/metadata.json
echo >> $DSTTESTI/metadata.json
echo ']' >> $DSTDEVI/metadata.json
echo ']' >> $DSTTRIALI/metadata.json
echo ']' >> $DSTTESTI/metadata.json
cp $DSTDEVI/metadata.json $DSTDEVG/metadata.json
cp $DSTTRIALI/metadata.json $DSTTRIALG/metadata.json
cp $DSTTESTI/metadata.json $DSTTESTG/metadata.json
###!!! For later updates we may want to generate only selected data sets.
###!!! The easiest approach is probably to generate everything and then remove
###!!! the superfluous data sets here.
#rm -rf $DSTTRAINI
#rm -rf $DSTTRAING
#rm -rf $DSTDEVI
#rm -rf $DSTDEVG
#rm -rf $DSTTRIALI
#rm -rf $DSTTRIALG
cd $DST/..
rm -f tira.zip
cd $DST
zip -r ../tira.zip *