forked from paracrawl/cirrus-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
09.clean
executable file
·57 lines (50 loc) · 1.59 KB
/
09.clean
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/bin/bash
# Post-process one batch of sentence pairs: merge the scores into the fixed
# pairs, drop takedown terms, and write the classified, filtered and stats
# files next to the inputs.
#
# Arguments:
#   $1  collection        name appended to every output row
#   $2  lang              (not referenced further in this script)
#   $3  target_lang_data  (not referenced further in this script)
#   $4  batch             directory holding fixed.gz and scored.gz
#
# Environment:
#   BICLEANER_THRESHOLD  minimum score for a row to reach the filtered output
#   SCRIPTS              directory containing filtered-terms.txt
#   BITEXTOR             directory containing bitextor-elrc-filtering.py
#   JOB_ID               optional; used as temp-file suffix instead of the PID
set -euo pipefail
shopt -s extglob

collection=$1
lang=$2
target_lang_data=$3
batch=$4

# Name of the directory that contains the batch directory.
shard=$(basename "$(dirname "$batch")")

# Suffix that keeps temporary outputs of concurrent runs apart.
TMPSFX=${JOB_ID:-$$}

FIXED=$batch/fixed.gz
SCORED=$batch/scored.gz
CLASSIFIED=$batch/classified.gz
# The first "." of the threshold is dropped from the name (0.5 -> filtered05.gz).
FILTERED=$batch/filtered${BICLEANER_THRESHOLD/./}.gz
STATS=$batch/stats.txt

# Both inputs must be readable; say which one is missing instead of letting
# `set -e` abort without any message.
for input in "$FIXED" "$SCORED"; do
  if [[ ! -r $input ]]; then
    echo "Cannot read input file: $input" >&2
    exit 1
  fi
done
# Columns in fixed.gz:
# 1. target url
# 2. source url
# 3. target sentence
# 4. source sentence
# 5. target checksum
# 6. source checksum
# 7. bifixer hash (indicates similar sentence pairs)
# 8. bifixer score (indicates which of the similar sentence pairs is the best)
#######################################
# Remove every line that contains a takedown term.
#
# Copies stdin to stdout, dropping each line that contains (as a fixed
# string, compared case-insensitively) any line of
# ${SCRIPTS}/filtered-terms.txt.
#
# Globals:  SCRIPTS (read)
# Returns:  0 on success, also when every input line was dropped;
#           non-zero when the terms file is unreadable or grep itself fails.
#######################################
filter_takedowns() {
  local terms=${SCRIPTS}/filtered-terms.txt
  local status=0

  # With an unreadable terms file grep would receive an empty pattern list,
  # which matches nothing, so every line would pass through unfiltered.
  if [[ ! -r $terms ]]; then
    echo "filter_takedowns: cannot read ${terms}" >&2
    return 1
  fi

  # Strip one trailing newline from the terms file: a blank last line would
  # otherwise be an empty pattern, and an empty pattern matches everything.
  grep -F -ivf <(sed -z '$ s/\n$//' < "$terms") || status=$?

  # grep exits with 1 when it selected no line, which is a legitimate result
  # here; only statuses above 1 signal a real error.
  if (( status > 1 )); then
    return "$status"
  fi
  return 0
}
# Main pipeline. Every output is written under a temporary name
# (<name>.$TMPSFX) first.
#
# NOTE(review): the two >(...) process substitutions fed by tee run
# asynchronously and nothing here waits for them explicitly - confirm they
# are always finished before the temporary files are renamed and consumed.
# NOTE(review): ${collection} is interpolated into the sed replacement, so
# it is assumed to contain no "/", "&" or "\" - confirm against callers.
#
# Pasting scored.gz next to fixed.gz adds the bicleaner score as column 9.
paste <(pigz -dc "$FIXED") <(pigz -dc "$SCORED") |
  filter_takedowns |
  # Add the collection as the 10th column.
  sed -e "s/$/\t${collection}/" |
  # Side outputs: the complete stream goes to the classified file, and its
  # line and word counts (tab-separated) go to the stats file.
  tee \
    >(pigz -9c > "${CLASSIFIED}.${TMPSFX}") \
    >(wc -wl | sed 's/^ \+//' | tr -s ' ' '\t' > "${STATS}.${TMPSFX}") |
  # Keep only rows whose bicleaner score (column 9) reaches the threshold.
  awk -F"\t" -v threshold="$BICLEANER_THRESHOLD" '$9 >= threshold' |
  python3 "$BITEXTOR/bitextor-elrc-filtering.py" -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection" -s |
  # Order by bifixer hash (column 7), then bifixer score (column 8), highest
  # first; byte-wise collation keeps the order locale-independent.
  LC_ALL=C sort -t$'\t' -k7,7 -k8,8nr |
  pigz -9c > "${FILTERED}.${TMPSFX}" ||
  {
    # Report the exit status of every pipeline stage on stderr.
    echo "Error in pipeline: ${PIPESTATUS[*]}" >&2
    exit 1
  }
# Publish the results: give the temporary outputs their final names. This
# point is only reached when the pipeline above did not fail.
mv -- "${CLASSIFIED}.${TMPSFX}" "$CLASSIFIED"
mv -- "${FILTERED}.${TMPSFX}" "$FILTERED"
mv -- "${STATS}.${TMPSFX}" "$STATS"
#rm fixed.gz