-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_vocabulary.sh
70 lines (57 loc) · 1.56 KB
/
generate_vocabulary.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/bin/bash
# Print the one-line usage summary for this script on stdout.
usage() {
  echo "Usage: $(basename -- "$0") [-s <tokenized_source_data_path>] [-t <tokenized_target_data_path>]"
}

# Parse the command-line options.
# Globals:   SRC_TOK (set by -s), TGT_TOK (set by -t),
#            OPTIND (left pointing just past the last parsed option)
# Arguments: the script's command-line arguments
# Returns:   exits 0 after printing usage for -h; exits 1 (usage on stderr)
#            for an unknown option or an option missing its argument
parse_options() {
  local opt
  OPTIND=1
  while getopts 's:t:h' opt; do
    case "$opt" in
      s)
        SRC_TOK="$OPTARG"
        echo "Processing option 's' with '${OPTARG}' argument"
        ;;
      t)
        TGT_TOK="$OPTARG"
        echo "Processing option 't' with '${OPTARG}' argument"
        ;;
      h)
        # Help was explicitly requested, so this is not an error.
        usage
        exit 0
        ;;
      *)
        # getopts reports invalid options and missing arguments as '?'.
        usage >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
# Resolve a possibly-relative path against the current working directory.
# Arguments: $1 - path (may be empty)
# Outputs:   the path unchanged if it is empty or already absolute,
#            otherwise the path prefixed with "$PWD/"
to_abs_path() {
  case "$1" in
    '' | /*) printf '%s\n' "$1" ;;
    *) printf '%s\n' "$PWD/$1" ;;
  esac
}

# Drop the options consumed by getopts so only positional arguments remain.
shift "$((OPTIND - 1))"
# From here on, abort on the first failing command.
set -e

echo "$SRC_TOK"
echo "$TGT_TOK"

# Pin the input paths to the invocation directory so they stay valid even if
# a later step changes the working directory.
SRC_TOK=$(to_abs_path "${SRC_TOK:-}")
TGT_TOK=$(to_abs_path "${TGT_TOK:-}")

# Working directories, created next to where the script is invoked.
TOOLS_PATH="$PWD/tools"
DATA_PATH="$PWD/data"
mkdir -p "$TOOLS_PATH"
mkdir -p "$DATA_PATH"

# Output vocabulary files.
SRC_VOCAB="$DATA_PATH/vocab.yue"
TGT_VOCAB="$DATA_PATH/vocab.zh"
FULL_VOCAB="$DATA_PATH/vocab.yue-zh"

# fastBPE dir
FASTBPE_DIR="$TOOLS_PATH/fastBPE"
FASTBPE="$FASTBPE_DIR/fast"
# Download fastBPE
# Clone straight into $FASTBPE_DIR so the destination does not depend on the
# current working directory; the clone is skipped when the directory exists.
if [ ! -d "$FASTBPE_DIR" ]; then
  echo "Cloning fastBPE from GitHub repository..."
  git clone https://github.com/glample/fastBPE "$FASTBPE_DIR" || {
    echo "Error: failed to clone fastBPE into $FASTBPE_DIR" >&2
    exit 1
  }
fi
echo "fastBPE found in: $FASTBPE_DIR"
# Compile fastBPE
# Build with absolute paths instead of changing directory, so the working
# directory afterwards does not depend on whether compilation ran. The build
# is skipped when the binary already exists.
if [ ! -f "$FASTBPE" ]; then
  echo "Compiling fastBPE..."
  g++ -std=c++11 -pthread -O3 "$FASTBPE_DIR/fastBPE/main.cc" \
    -I"$FASTBPE_DIR/fastBPE" -o "$FASTBPE" || {
    echo "Error: failed to compile fastBPE" >&2
    exit 1
  }
fi
echo "fastBPE compiled in: $FASTBPE"
# extract vocabulary
# if this fails, it can be run manually after the code above has successfully completed

# Run "$FASTBPE getvocab" on the given input file(s) and install the result
# at the given output path.
# Arguments: $1  - output vocabulary file
#            $2… - tokenized input file(s)
# Returns:   0 on success; non-zero if fastBPE or the final rename fails
write_vocab() {
  local out=$1
  shift
  # Write to a sibling temp file and rename only on success, so a failed run
  # never leaves a truncated vocabulary that the -f check below would accept.
  if ! "$FASTBPE" getvocab "$@" > "$out.tmp"; then
    rm -f -- "$out.tmp"
    return 1
  fi
  mv -- "$out.tmp" "$out"
}

# Extraction is skipped only when all three vocabulary files already exist.
if ! [[ -f "$SRC_VOCAB" && -f "$TGT_VOCAB" && -f "$FULL_VOCAB" ]]; then
  # The tokenized inputs are only needed when extraction actually runs.
  if [[ ! -r "${SRC_TOK:-}" ]]; then
    echo "Error: cannot read tokenized source data given with -s: '${SRC_TOK:-}'" >&2
    exit 1
  fi
  if [[ ! -r "${TGT_TOK:-}" ]]; then
    echo "Error: cannot read tokenized target data given with -t: '${TGT_TOK:-}'" >&2
    exit 1
  fi
  echo "Extracting vocabulary..."
  write_vocab "$SRC_VOCAB" "$SRC_TOK" || exit 1
  write_vocab "$TGT_VOCAB" "$TGT_TOK" || exit 1
  write_vocab "$FULL_VOCAB" "$SRC_TOK" "$TGT_TOK" || exit 1
fi
echo "YUE vocab in: $SRC_VOCAB"
echo "ZH vocab in: $TGT_VOCAB"
echo "Full vocab in: $FULL_VOCAB"