Commit a2f3229 (parent 81caf68): 8 changed files with 203 additions and 0 deletions.
@@ -0,0 +1,22 @@
# Bitextor Catalan-English models

# Teacher
We use a Tatoeba-MT teacher, [Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30](https://object.pouta.csc.fi/Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30.zip), which was trained on the [Tatoeba](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data) dataset and back-translations.

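The teacher archive can be fetched and unpacked with standard tools; a minimal sketch (the target directory name is just an example):

```bash
# Fetch the Tatoeba-MT teacher and unpack it (directory name is illustrative)
wget https://object.pouta.csc.fi/Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30.zip
unzip opus+bt-2021-04-30.zip -d teacher-cat-eng
```
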
# Students
We used the Firefox translations training pipeline to train new students with tiny and base architectures. For data, we used [Tatoeba](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/data), as well as the monolingual [Catalan Textual Corpus](https://zenodo.org/record/4519349) and the [MaCoCu](https://www.clarin.si/repository/xmlui/handle/11356/1837) dataset.

# Evaluation
BLEU scores for the teacher and the students:

| Model | Flores200 devtest (BLEU) | Model size | Tokens/s on 1 CPU |
|---|---|---|---|
| Teacher | 41.5 | 798M | - |
| Student-base | 40.3 | 41M | 1224.97 |
| Student-tiny | 39.4 | 17M | 2940.61 |
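
The Flores200 devtest scores above can in principle be reproduced with `sacrebleu` once the devtest source has been translated; a rough sketch, assuming `run.sh` reads source sentences on stdin and writes translations to stdout (file names are placeholders):

```bash
# Translate the Flores200 devtest source, then score against the reference.
# flores200.devtest.ca and flores200.devtest.en are placeholder file names.
bash run.sh < flores200.devtest.ca > hypothesis.en
sacrebleu flores200.devtest.en -i hypothesis.en -m bleu
```
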
# How to run
1. Compile [browsermt/marian-dev](https://github.com/browsermt/marian-dev).
2. Each model comes with a script called `run.sh` and a configuration file `config.yml`; modify them as needed.
3. Run `bash run.sh` (see the sketch below).
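
An end-to-end sketch of the steps above (the CMake flags are illustrative; pick the ones that match your hardware, see the marian-dev documentation):

```bash
# 1. Compile browsermt/marian-dev (build flags are illustrative)
git clone https://github.com/browsermt/marian-dev
cd marian-dev && mkdir build && cd build
cmake .. -DCOMPILE_CPU=on -DCOMPILE_CUDA=off
make -j8
cd ../..

# 2. Edit the model's config.yml (e.g. paths, number of threads) if needed

# 3. Run the bundled script
bash run.sh
```
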
@@ -0,0 +1,3 @@
# Download student model from Allas container
wget https://object.pouta.csc.fi/hplt_bitextor_models/ca-en_exported_base.zip
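
After the download, the archive just needs to be unpacked; the exact contents of the zip (model, vocabulary, shortlist) depend on how the student was exported:

```bash
# Unpack the exported base student in place
unzip ca-en_exported_base.zip
```
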
@@ -0,0 +1,63 @@
####
# Example of a production config
# Change language pair, experiment name, datasets and other settings if needed
# Training low resource languages might require more tuning of pipeline/training/configs
###

experiment:
  name: opusmt
  src: ca
  trg: en
  src_three_letter: cat
  trg_three_letter: eng

  # OPUS models are not ensembled, they have different vocabs anyway
  teacher-ensemble: 1

  # URL to the OPUS-MT model to use as the teacher
  opusmt-teacher: "https://object.pouta.csc.fi/Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30.zip"

  # URL to the OPUS-MT model to use as the backward model
  opusmt-backward: "https://object.pouta.csc.fi/Tatoeba-MT-models/eng-cat/opus+bt-2021-04-10.zip"

  # Only specify this if model is target-multilingual
  target-language-token:

  # path to a pretrained backward model (optional)
  backward-model: ""

  # limits per downloaded dataset
  mono-max-sentences-src: 100000000
  mono-max-sentences-trg: 20000000

  spm-sample-size: 2000000

  # split corpus to parallelize translation
  split-length: 2000000

  best-model: perplexity

  bicleaner:
    default-threshold: 0.5
    dataset-thresholds:
      tc_Tatoeba-train-v2021-08-07.cat.eng: 0.5

# TODO: extract this info straight from the OPUS model yml info file
datasets:
  # parallel training corpus
  train:
    - tc_Tatoeba-Challenge-v2021-08-07
  # datasets to merge for validation while training. Translations by teacher
  devtest:
    - flores_dev
  # datasets for evaluation
  test:
    - flores_devtest
  mono-src:
    - custom-corpus/catext
    - custom-corpus/macocu

marian-args:
  decoding-teacher: # added to reduce time
    mini-batch: 64
    beam-size: 4
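
This file is in the configuration format of the Snakemake-based Firefox training pipeline (OPUS-MT fork). A hypothetical launch, run from the pipeline's own checkout; the entry point and workflow layout below are assumptions, so consult the pipeline's README for the exact command:

```bash
# Hypothetical: run from the root of the training pipeline checkout,
# where its Snakefile lives; flags are illustrative.
snakemake --use-conda --cores all --configfile config.yml
```
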
@@ -0,0 +1,22 @@
# Model options
dec-cell-base-depth: 2
dec-cell-high-depth: 1
dec-cell: ssru
dec-depth: 2
dim-emb: 512
enc-cell-depth: 1
enc-cell: gru
enc-depth: 6
enc-type: bidirectional
tied-embeddings-all: true
transformer-decoder-autoreg: rnn
transformer-dim-ffn: 2048
transformer-ffn-activation: relu
transformer-ffn-depth: 2
transformer-guided-alignment-layer: last
transformer-heads: 8
transformer-postprocess: dan
transformer-preprocess: ""
transformer-tied-layers: []
transformer-train-position-embeddings: false
type: transformer
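
These are the Marian model hyperparameters for the base student. In a training run they would be passed to `marian` via `--config`, roughly as below (the file name and data/vocab paths are placeholders, not part of this commit):

```bash
# Illustrative only: train a base student with this architecture file.
# student.base.yml, corpus.* and vocab.caen.spm are placeholder paths.
marian --config student.base.yml \
  --train-sets corpus.ca corpus.en \
  --vocabs vocab.caen.spm vocab.caen.spm \
  --model student-base/model.npz
```
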
@@ -0,0 +1,3 @@
# Download student model from Allas container
wget https://object.pouta.csc.fi/hplt_bitextor_models/ca-en_exported_tiny.zip
@@ -0,0 +1,63 @@
####
# Example of a production config
# Change language pair, experiment name, datasets and other settings if needed
# Training low resource languages might require more tuning of pipeline/training/configs
###

experiment:
  name: opusmt
  src: ca
  trg: en
  src_three_letter: cat
  trg_three_letter: eng

  # OPUS models are not ensembled, they have different vocabs anyway
  teacher-ensemble: 1

  # URL to the OPUS-MT model to use as the teacher
  opusmt-teacher: "https://object.pouta.csc.fi/Tatoeba-MT-models/cat-eng/opus+bt-2021-04-30.zip"

  # URL to the OPUS-MT model to use as the backward model
  opusmt-backward: "https://object.pouta.csc.fi/Tatoeba-MT-models/eng-cat/opus+bt-2021-04-10.zip"

  # Only specify this if model is target-multilingual
  target-language-token:

  # path to a pretrained backward model (optional)
  backward-model: ""

  # limits per downloaded dataset
  mono-max-sentences-src: 100000000
  mono-max-sentences-trg: 20000000

  spm-sample-size: 2000000

  # split corpus to parallelize translation
  split-length: 2000000

  best-model: perplexity

  bicleaner:
    default-threshold: 0.5
    dataset-thresholds:
      tc_Tatoeba-train-v2021-08-07.cat.eng: 0.5

# TODO: extract this info straight from the OPUS model yml info file
datasets:
  # parallel training corpus
  train:
    - tc_Tatoeba-Challenge-v2021-08-07
  # datasets to merge for validation while training. Translations by teacher
  devtest:
    - flores_dev
  # datasets for evaluation
  test:
    - flores_devtest
  mono-src:
    - custom-corpus/catext
    - custom-corpus/macocu

marian-args:
  decoding-teacher: # added to reduce time
    mini-batch: 64
    beam-size: 4
@@ -0,0 +1,25 @@
# https://github.com/browsermt/students/tree/master/train-student/models/student.tiny11
dec-cell-base-depth: 2
dec-cell-high-depth: 1
dec-cell: ssru
dec-depth: 2
dim-emb: 256
dim-vocabs: [32000, 32000]
enc-cell-depth: 1
enc-cell: gru
enc-depth: 6
enc-type: bidirectional
tied-embeddings-all: true
transformer-decoder-autoreg: rnn
transformer-dim-ffn: 1536
transformer-ffn-activation: relu
transformer-ffn-depth: 2
transformer-guided-alignment-layer: last
transformer-heads: 8
transformer-no-projection: false
transformer-postprocess-emb: d
transformer-postprocess: dan
transformer-preprocess: ""
transformer-tied-layers: []
transformer-train-position-embeddings: false
type: transformer