From 798eaa531902ac8a5dfb1b555e9bdbf98af44d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 21 Oct 2021 13:18:04 +0300 Subject: [PATCH 01/13] Add scripts for regenerating the expected outputs for adaptive tests --- tests/_self-adaptive/gen-costs.py | 176 ++++++++++++++++++ .../regenerate-expected-outputs.sh | 35 ++++ 2 files changed, 211 insertions(+) create mode 100755 tests/_self-adaptive/gen-costs.py create mode 100755 tests/_self-adaptive/regenerate-expected-outputs.sh diff --git a/tests/_self-adaptive/gen-costs.py b/tests/_self-adaptive/gen-costs.py new file mode 100755 index 00000000..afcf993b --- /dev/null +++ b/tests/_self-adaptive/gen-costs.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 + +import argparse as ag +import os +import sys +import glob +import subprocess as sp +import re +import tempfile +import shutil +import dataclasses +from dataclasses import dataclass + +@dataclass +class MarianTrainConfig: + marian_dir: str + model: str + vocab1: str + vocab2: str + dim_vocab1: int + dim_vocab2: int + epochs: int + + +def main(): + parser = ag.ArgumentParser() + parser.add_argument('-t', '--train-sets', + type=ag.FileType('r', encoding='utf-8'), nargs=2, required=True) + parser.add_argument('-i', '--input', type=ag.FileType('r', encoding='utf-8'), required=True) + parser.add_argument('-m', '--model', required=True) + parser.add_argument('-v', '--vocabs', nargs=2, required=True) + parser.add_argument('-e', '--epochs', type=int, required=True) + parser.add_argument('--marian-dir', required=True) + parser.add_argument('--output-costs', type=ag.FileType('w', encoding='utf-8')) + parser.add_argument('--output-transl', type=ag.FileType('w', encoding='utf-8')) + + args = parser.parse_args() + + [sfile, tfile] = args.train_sets + [vocab1, vocab2] = args.vocabs + config = MarianTrainConfig(args.marian_dir, args.model, vocab1, vocab2, 85000, 85000, args.epochs) + costs_and_translations = iterate_over_inputs(config, sfile, tfile, 
args.input) + output_costs_and_translations(costs_and_translations, args.output_costs, args.output_transl) + + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + +def iterate_over_inputs(config, source, target, inputs): + all_costs_and_translations = [] + files_removed = True + has_context = False + try: + for sline, tline in zip(source, target): + if files_removed: + sfile = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) + tfile = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) + eprint(f"Created temp files for training data: {sfile.name}, {tfile.name}") + files_removed = False + has_context = False + + if sline != "\n" or tline != "\n": + eprint("NOOOOOOOO") + eprint(sline) + eprint(tline) + sfile.write(sline) + tfile.write(tline) + has_context = True + else: + sfile.close() + tfile.close() + eprint("AAAAAA") + input_line = inputs.readline() + if has_context: + costs_and_translations = run_marian(sfile.name, tfile.name, input_line, config) + else: + eprint("No context provided, skipping training") + translations = translate_marian(input_line, config) + costs_and_translations = [([], t) for t in translations] + all_costs_and_translations.append(costs_and_translations) + os.remove(sfile.name) + os.remove(tfile.name) + files_removed = True + # If the files didn't end with a newline, marian wasn't run for the last set of sentences + if not files_removed: + sfile.close() + tfile.close() + eprint("AAAAAA") + input_line = inputs.readline() + if has_context: + costs_and_translations = run_marian(sfile.name, tfile.name, input_line, config) + else: + eprint("No context provided, skipping training") + translations = translate_marian(input_line, config) + costs_and_translations = [([], t) for t in translations] + all_costs_and_translations.append(costs_and_translations) + os.remove(sfile.name) + os.remove(tfile.name) + finally: + for f in [sfile, tfile]: + if f is not None and os.path.exists(f.name): + if 
not f.closed: + f.close() + os.remove(f.name) + eprint(f"ERROR: Needed cleanup for {f.name}") + return all_costs_and_translations + +def run_marian(sfile, tfile, input_line, config): + c = config + temp_model_path = create_temp_model_copy(c.model) + new_config = dataclasses.replace(c, model=temp_model_path) + costs = train_marian(sfile, tfile, new_config) + for path in glob.glob(f"{temp_model_path}*"): + eprint(f"Removing model file: {path}") + os.remove(path) + translations = translate_marian(input_line, config) + return (costs, translations) + +def create_temp_model_copy(model): + fd, path = tempfile.mkstemp(suffix='.npz') + eprint(f"Created temp file for model: {path}") + os.close(fd) + shutil.copyfile(model, path) + return path + +def train_marian(sfile, tfile, config): + c = config + process = sp.run([f"{c.marian_dir}/marian", '-m', c.model, '--disp-freq', '1', '--type', 'amun', '-v', + c.vocab1, '-v', c.vocab2, '--dim-vocabs', str(c.dim_vocab1), str(c.dim_vocab2), '--dim-emb', '500', + '--after-epochs', str(c.epochs), '--mini-batch', '1', '-t', sfile, tfile], capture_output=True, text=True) + eprint("STDOUT:") + eprint(process.stdout) + eprint("STDERR:") + eprint(process.stderr) + costs = extract_costs(process.stderr) + eprint("COSTS:") + eprint(costs) + return costs + +def extract_costs(output_log): + p = re.compile('Ep\..* Cost ([-e0-9.]+) .*: Time') + costs = [] + for line in output_log.splitlines(): + m = p.search(line) + if m is not None: + costs.append(m.group(1)) + return costs + +def translate_marian(input_line, config): + c = config + process = sp.run([f"{c.marian_dir}/marian-decoder", '-m', c.model, + '--type', 'amun', '-v', c.vocab1, '-v', c.vocab2, + '--dim-vocabs', str(c.dim_vocab1), str(c.dim_vocab2), + '--dim-emb', '500'], input=input_line, capture_output=True, text=True) + eprint("STDOUT:") + eprint(process.stdout) + eprint("STDERR:") + eprint(process.stderr) + translations = process.stdout.splitlines() + eprint(translations) + return 
translations + +def output_costs_and_translations(costs_and_translations, output_costs, output_transl): + eprint("COSTS AND TRANSLATIONS:") + eprint(costs_and_translations) + + if output_costs is not None: + all_costs = [cost for costs, _ in costs_and_translations for cost in costs] + output_costs.write('\n'.join(all_costs)) + if output_transl is not None: + all_translations = [translation for _, translations in costs_and_translations for translation in translations] + output_transl.write('\n'.join(all_translations)) + + +if __name__ == "__main__": + main() diff --git a/tests/_self-adaptive/regenerate-expected-outputs.sh b/tests/_self-adaptive/regenerate-expected-outputs.sh new file mode 100755 index 00000000..de0753e1 --- /dev/null +++ b/tests/_self-adaptive/regenerate-expected-outputs.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -euo pipefail + +MRT_MODELS=../../models +MRT_TOOLS=../../tools + +MODELS=$MRT_MODELS/wmt16_systems/en-de + +./gen-costs.py \ + -t ubuntu.oracle_2s1e.{src,ref} \ + -m $MODELS/model.npz \ + -v $MODELS/vocab.{en,de}.json \ + -e 1 \ + --marian-dir ~/prog/cpp/marian-adaptive/build/ \ + -i ubuntu.src \ + --output-costs costs.expected \ + --output-transl oracle.expected + +# Generate BLEU +$MRT_TOOLS/moses-scripts/scripts/generic/multi-bleu.perl -lc ubuntu.ref < oracle.expected > oracle.bleu.expected + + + +./gen-costs.py \ + -t ubuntu.oracle_2s1e.{src,ref} \ + -m $MODELS/model.npz \ + -v $MODELS/vocab.{en,de}.json \ + -e 1 \ + --marian-dir ~/prog/cpp/marian-adaptive/build/ \ + -i ubuntu.src \ + --output-costs costs.expected \ + --output-transl oracle.expected + +# Generate BLEU +$MRT_TOOLS/moses-scripts/scripts/generic/multi-bleu.perl -lc ubuntu.ref < oracle.expected2 > oracle.bleu.expected2 From edb6522697895637cb1b1a929de5ca7b5b14eebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 21 Oct 2021 14:20:37 +0300 Subject: [PATCH 02/13] Factor out training file generation into a generator --- 
tests/_self-adaptive/gen-costs.py | 73 ++++++++++++++----------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/tests/_self-adaptive/gen-costs.py b/tests/_self-adaptive/gen-costs.py index afcf993b..54a09130 100755 --- a/tests/_self-adaptive/gen-costs.py +++ b/tests/_self-adaptive/gen-costs.py @@ -46,48 +46,43 @@ def main(): def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) -def iterate_over_inputs(config, source, target, inputs): - all_costs_and_translations = [] - files_removed = True - has_context = False - try: - for sline, tline in zip(source, target): - if files_removed: - sfile = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) - tfile = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) - eprint(f"Created temp files for training data: {sfile.name}, {tfile.name}") - files_removed = False - has_context = False - - if sline != "\n" or tline != "\n": - eprint("NOOOOOOOO") - eprint(sline) - eprint(tline) - sfile.write(sline) - tfile.write(tline) - has_context = True - else: - sfile.close() - tfile.close() - eprint("AAAAAA") - input_line = inputs.readline() - if has_context: - costs_and_translations = run_marian(sfile.name, tfile.name, input_line, config) - else: - eprint("No context provided, skipping training") - translations = translate_marian(input_line, config) - costs_and_translations = [([], t) for t in translations] - all_costs_and_translations.append(costs_and_translations) - os.remove(sfile.name) - os.remove(tfile.name) - files_removed = True - # If the files didn't end with a newline, marian wasn't run for the last set of sentences - if not files_removed: +def training_file_generator(source, target): + begin_sentences = True + contains_sentences = False + for sline, tline in zip(source, target): + if begin_sentences: + sfile = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) + tfile = tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False) 
+ eprint(f"Created temp files for training data: {sfile.name}, {tfile.name}") + begin_sentences = False + contains_sentences = False + + if sline != "\n" or tline != "\n": + eprint("NOOOOOOOO") + eprint(sline) + eprint(tline) + sfile.write(sline) + tfile.write(tline) + contains_sentences = True + else: sfile.close() tfile.close() eprint("AAAAAA") - input_line = inputs.readline() - if has_context: + yield (contains_sentences, sfile, tfile) + begin_sentences = True + + # The last non-empty set of sentences can not be delimited with an empty line + if contains_sentences: + sfile.close() + tfile.close() + eprint("AAAAAA") + yield (contains_sentences, sfile, tfile) + +def iterate_over_inputs(config, source, target, inputs): + all_costs_and_translations = [] + try: + for input_line, (contains_sentences, sfile, tfile) in zip(inputs, training_file_generator(source, target)): + if contains_sentences: costs_and_translations = run_marian(sfile.name, tfile.name, input_line, config) else: eprint("No context provided, skipping training") From b098f9e4a5d9b61b82c68b326b888cfc0073ce4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 21 Oct 2021 15:53:06 +0300 Subject: [PATCH 03/13] Fix empty cost list insertion when not training --- tests/_self-adaptive/gen-costs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/_self-adaptive/gen-costs.py b/tests/_self-adaptive/gen-costs.py index 54a09130..91b209e9 100755 --- a/tests/_self-adaptive/gen-costs.py +++ b/tests/_self-adaptive/gen-costs.py @@ -87,7 +87,7 @@ def iterate_over_inputs(config, source, target, inputs): else: eprint("No context provided, skipping training") translations = translate_marian(input_line, config) - costs_and_translations = [([], t) for t in translations] + costs_and_translations = ([], translations) all_costs_and_translations.append(costs_and_translations) os.remove(sfile.name) os.remove(tfile.name) From 97d59f2d3cfd850db868682b85e8a0e4b4cc8982 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 21 Oct 2021 15:53:47 +0300 Subject: [PATCH 04/13] Generate outputs for the other test kinds --- .../regenerate-expected-outputs.sh | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/_self-adaptive/regenerate-expected-outputs.sh b/tests/_self-adaptive/regenerate-expected-outputs.sh index de0753e1..ca48c0a6 100755 --- a/tests/_self-adaptive/regenerate-expected-outputs.sh +++ b/tests/_self-adaptive/regenerate-expected-outputs.sh @@ -6,6 +6,7 @@ MRT_TOOLS=../../tools MODELS=$MRT_MODELS/wmt16_systems/en-de +echo "### Generating files for the oracle tests" ./gen-costs.py \ -t ubuntu.oracle_2s1e.{src,ref} \ -m $MODELS/model.npz \ @@ -19,17 +20,24 @@ MODELS=$MRT_MODELS/wmt16_systems/en-de # Generate BLEU $MRT_TOOLS/moses-scripts/scripts/generic/multi-bleu.perl -lc ubuntu.ref < oracle.expected > oracle.bleu.expected - - +echo "\n\n### Generating files for the partial context tests" ./gen-costs.py \ - -t ubuntu.oracle_2s1e.{src,ref} \ + -t ubuntu.contextpart.{src,ref} \ -m $MODELS/model.npz \ -v $MODELS/vocab.{en,de}.json \ -e 1 \ --marian-dir ~/prog/cpp/marian-adaptive/build/ \ -i ubuntu.src \ - --output-costs costs.expected \ - --output-transl oracle.expected + --output-costs contextpart.costs.expected \ + --output-transl contextpart.expected -# Generate BLEU -$MRT_TOOLS/moses-scripts/scripts/generic/multi-bleu.perl -lc ubuntu.ref < oracle.expected2 > oracle.bleu.expected2 + +echo "\n\n### Generating files for the no context tests" +./gen-costs.py \ + -t ubuntu.nocontext.{src,ref} \ + -m $MODELS/model.npz \ + -v $MODELS/vocab.{en,de}.json \ + -e 1 \ + --marian-dir ~/prog/cpp/marian-adaptive/build/ \ + -i ubuntu.nocontext.src \ + --output-transl nocontext.expected From 6c90942b8b7b63725ef2b20adc0797fe05bba614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 22 Oct 2021 15:15:52 +0300 Subject: [PATCH 05/13] Don't swallow the last newline 
for translations; fix no context tests --- tests/_self-adaptive/gen-costs.py | 5 +++-- tests/_self-adaptive/regenerate-expected-outputs.sh | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/_self-adaptive/gen-costs.py b/tests/_self-adaptive/gen-costs.py index 91b209e9..22f56a1d 100755 --- a/tests/_self-adaptive/gen-costs.py +++ b/tests/_self-adaptive/gen-costs.py @@ -147,6 +147,7 @@ def translate_marian(input_line, config): '--type', 'amun', '-v', c.vocab1, '-v', c.vocab2, '--dim-vocabs', str(c.dim_vocab1), str(c.dim_vocab2), '--dim-emb', '500'], input=input_line, capture_output=True, text=True) + eprint(f"Translate input: {input_line}") eprint("STDOUT:") eprint(process.stdout) eprint("STDERR:") @@ -161,10 +162,10 @@ def output_costs_and_translations(costs_and_translations, output_costs, output_t if output_costs is not None: all_costs = [cost for costs, _ in costs_and_translations for cost in costs] - output_costs.write('\n'.join(all_costs)) + output_costs.writelines(map(lambda c: c + '\n', all_costs)) if output_transl is not None: all_translations = [translation for _, translations in costs_and_translations for translation in translations] - output_transl.write('\n'.join(all_translations)) + output_transl.writelines(map(lambda t: t + '\n', all_translations)) if __name__ == "__main__": diff --git a/tests/_self-adaptive/regenerate-expected-outputs.sh b/tests/_self-adaptive/regenerate-expected-outputs.sh index ca48c0a6..4223490c 100755 --- a/tests/_self-adaptive/regenerate-expected-outputs.sh +++ b/tests/_self-adaptive/regenerate-expected-outputs.sh @@ -20,7 +20,7 @@ echo "### Generating files for the oracle tests" # Generate BLEU $MRT_TOOLS/moses-scripts/scripts/generic/multi-bleu.perl -lc ubuntu.ref < oracle.expected > oracle.bleu.expected -echo "\n\n### Generating files for the partial context tests" +echo -e "\n\n### Generating files for the partial context tests" ./gen-costs.py \ -t ubuntu.contextpart.{src,ref} \ -m $MODELS/model.npz 
\ @@ -32,12 +32,12 @@ echo "\n\n### Generating files for the partial context tests" --output-transl contextpart.expected -echo "\n\n### Generating files for the no context tests" +echo -e "\n\n### Generating files for the no context tests" ./gen-costs.py \ -t ubuntu.nocontext.{src,ref} \ -m $MODELS/model.npz \ -v $MODELS/vocab.{en,de}.json \ -e 1 \ --marian-dir ~/prog/cpp/marian-adaptive/build/ \ - -i ubuntu.nocontext.src \ + -i ubuntu.src \ --output-transl nocontext.expected From 2efa3bc9dd9929bbfee5c9b2934d900fd365b979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 22 Oct 2021 15:48:36 +0300 Subject: [PATCH 06/13] Change the optimizer to sgd to match adaptive marian's default --- tests/_self-adaptive/gen-costs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/_self-adaptive/gen-costs.py b/tests/_self-adaptive/gen-costs.py index 22f56a1d..246f5dfd 100755 --- a/tests/_self-adaptive/gen-costs.py +++ b/tests/_self-adaptive/gen-costs.py @@ -120,7 +120,7 @@ def create_temp_model_copy(model): def train_marian(sfile, tfile, config): c = config - process = sp.run([f"{c.marian_dir}/marian", '-m', c.model, '--disp-freq', '1', '--type', 'amun', '-v', + process = sp.run([f"{c.marian_dir}/marian", '-m', c.model, '--disp-freq', '1', '--optimizer', 'sgd', '--type', 'amun', '-v', c.vocab1, '-v', c.vocab2, '--dim-vocabs', str(c.dim_vocab1), str(c.dim_vocab2), '--dim-emb', '500', '--after-epochs', str(c.epochs), '--mini-batch', '1', '-t', sfile, tfile], capture_output=True, text=True) eprint("STDOUT:") From ded1fb7fb018a7fc361ae7702a8c5bbee84fee3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 22 Oct 2021 15:52:06 +0300 Subject: [PATCH 07/13] Update the *.expected files It's weird that the BLEU scores drop that much.
Don't know why that's happening and whether that's a problem --- .../_self-adaptive/contextpart.costs.expected | 24 +++++----- tests/_self-adaptive/contextpart.expected | 10 ++--- tests/_self-adaptive/costs.expected | 44 +++++++++---------- tests/_self-adaptive/oracle.bleu.expected | 2 +- tests/_self-adaptive/oracle.expected | 18 ++++---- 5 files changed, 49 insertions(+), 49 deletions(-) diff --git a/tests/_self-adaptive/contextpart.costs.expected b/tests/_self-adaptive/contextpart.costs.expected index 67c51dcc..30a7cedb 100644 --- a/tests/_self-adaptive/contextpart.costs.expected +++ b/tests/_self-adaptive/contextpart.costs.expected @@ -1,12 +1,12 @@ -22.19 -2.75 -6.56 -0.29 -47.07 -15.31 -4.52 -0.11 -24.44 -2.20 -28.06 -6.56 +2.21881247 +2.21806955 +0.46859190 +0.46825695 +2.76864076 +2.76791906 +0.56501639 +0.56451356 +2.71565461 +2.71488905 +3.11743832 +3.11609936 diff --git a/tests/_self-adaptive/contextpart.expected b/tests/_self-adaptive/contextpart.expected index 4a222e8b..5d2b071d 100644 --- a/tests/_self-adaptive/contextpart.expected +++ b/tests/_self-adaptive/contextpart.expected @@ -1,11 +1,11 @@ klicken und ziehen -die linke Maustaste für einen Rechtsklick gedrückt halten . -Sie können rechts@@ klicken , indem Sie die linke Maustaste gedrückt halten . +drücken Sie die linke Maustaste und klicken Sie mit der rechten Maustaste . +Sie können mit Rechtsklick die linke Maustaste gedrückt halten . wechseln Sie &@@ lt@@ ; gu@@ i > Sim@@ ulated Secondary Click &@@ lt@@ ; / gu@@ i > weiter . -warum sollte ich meine E-Mail-@@ Konten oder sozialen Medien zu Ihrem Desktop hinzufügen ? +warum fügen Sie Ihre E-Mail oder Social Media auf Ihren Desktop ? warum sollte ich ein Konto hinzufügen ? -Anwendungen entfernen , die Sie nicht mehr benötigen +entfernen Sie die Software , die Sie nicht mehr verwenden . prüfen Sie Ihre Sicherung die von Ihnen verwendete Software kann in der Regel recht schnell wieder hergestellt werden . 
zurück eure wichtigen Akten -was ist die " Super " -Taste ? +was ist der " Super " -@@ Schlüssel ? diff --git a/tests/_self-adaptive/costs.expected b/tests/_self-adaptive/costs.expected index 1c8cb87c..326568ae 100644 --- a/tests/_self-adaptive/costs.expected +++ b/tests/_self-adaptive/costs.expected @@ -1,22 +1,22 @@ -1.26 -0.01 -22.19 -2.75 -6.56 -0.29 -77.35 -52.49 -47.07 -15.31 -4.52 -0.11 -24.44 -2.20 -7.48 -0.13 -38.95 -8.38 -13.98 -0.36 -28.06 -6.56 +0.31558657 +0.31486344 +2.21881247 +2.21806955 +0.46859190 +0.46825695 +4.07114267 +4.07016516 +2.76864076 +2.76791906 +0.56501639 +0.56451356 +2.71565461 +2.71488905 +1.87033248 +1.86898255 +1.77039230 +1.76994097 +2.79638195 +2.79453039 +3.11743832 +3.11609936 diff --git a/tests/_self-adaptive/oracle.bleu.expected b/tests/_self-adaptive/oracle.bleu.expected index 104e7681..71a2740c 100644 --- a/tests/_self-adaptive/oracle.bleu.expected +++ b/tests/_self-adaptive/oracle.bleu.expected @@ -1 +1 @@ -BLEU = 79.14, 96.0/90.9/85.7/81.8 (BP=0.895, ratio=0.900, hyp_len=99, ref_len=110) +BLEU = 28.81, 51.4/33.7/24.1/17.1 (BP=0.991, ratio=0.991, hyp_len=109, ref_len=110) diff --git a/tests/_self-adaptive/oracle.expected b/tests/_self-adaptive/oracle.expected index 8ca6bdbc..5d2b071d 100644 --- a/tests/_self-adaptive/oracle.expected +++ b/tests/_self-adaptive/oracle.expected @@ -1,11 +1,11 @@ klicken und ziehen -die linke Maustaste für einen Rechtsklick gedrückt halten . -Sie können rechts@@ klicken , indem Sie die linke Maustaste gedrückt halten . -aktivieren Sie &@@ lt@@ ; gu@@ i > . -warum sollte ich meine E-Mail-@@ Konten oder sozialen Medien zu Ihrem Desktop hinzufügen ? +drücken Sie die linke Maustaste und klicken Sie mit der rechten Maustaste . +Sie können mit Rechtsklick die linke Maustaste gedrückt halten . +wechseln Sie &@@ lt@@ ; gu@@ i > Sim@@ ulated Secondary Click &@@ lt@@ ; / gu@@ i > weiter . +warum fügen Sie Ihre E-Mail oder Social Media auf Ihren Desktop ? warum sollte ich ein Konto hinzufügen ? 
-Anwendungen entfernen , die Sie nicht mehr benötigen -ihre Sicherung überprüfen -die Anwendungen , die Sie nutzen , können durch Neu@@ installation nach einem schwerwiegenden Rechner@@ problem meist schnell wiederhergestellt werden . -sichern Ihrer wichtigen Dateien -was ist die " Super " -Taste ? +entfernen Sie die Software , die Sie nicht mehr verwenden . +prüfen Sie Ihre Sicherung +die von Ihnen verwendete Software kann in der Regel recht schnell wieder hergestellt werden . +zurück eure wichtigen Akten +was ist der " Super " -@@ Schlüssel ? From b20cbfd599a88f70b2f9dd717ebb44c01575f44b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 26 Oct 2021 15:56:44 +0300 Subject: [PATCH 08/13] Update server test expected outputs --- tests/_self-adaptive/test_server_mode.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/_self-adaptive/test_server_mode.sh b/tests/_self-adaptive/test_server_mode.sh index 3c79f62e..4a756619 100644 --- a/tests/_self-adaptive/test_server_mode.sh +++ b/tests/_self-adaptive/test_server_mode.sh @@ -19,11 +19,11 @@ kill $SERVER_PID test -e server.log grep -q "listening on port 8766" server.log -grep -q '{"output":"dies ist ein Beispiel' server.log +grep -q '{"output":"das ist ein Beispiel' server.log grep -q "Ep. 2 : Up. 4 : Sen. 2" server.log grep -q "Ep. 2 : Up. 2 : Sen. 
1" server.log grep -q "No context" server.log -grep -q 'dies ist ein Beispiel' text.out +grep -q 'das ist ein Beispiel' text.out # Exit with success code exit 0 From e92dafc31dc591b2ff788b33a5fad5670c73d829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 8 Nov 2021 14:08:55 +0200 Subject: [PATCH 09/13] Make debug outputs saner --- tests/_self-adaptive/gen-costs.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/_self-adaptive/gen-costs.py b/tests/_self-adaptive/gen-costs.py index 246f5dfd..9a8ac77f 100755 --- a/tests/_self-adaptive/gen-costs.py +++ b/tests/_self-adaptive/gen-costs.py @@ -58,16 +58,14 @@ def training_file_generator(source, target): contains_sentences = False if sline != "\n" or tline != "\n": - eprint("NOOOOOOOO") - eprint(sline) - eprint(tline) + eprint(sline.rstrip()) + eprint(tline.rstrip()) sfile.write(sline) tfile.write(tline) contains_sentences = True else: sfile.close() tfile.close() - eprint("AAAAAA") yield (contains_sentences, sfile, tfile) begin_sentences = True @@ -75,7 +73,6 @@ def training_file_generator(source, target): if contains_sentences: sfile.close() tfile.close() - eprint("AAAAAA") yield (contains_sentences, sfile, tfile) def iterate_over_inputs(config, source, target, inputs): From 6e0f4aac9f9c3abe866ce27a812e89ccc772f7ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 8 Nov 2021 15:53:55 +0200 Subject: [PATCH 10/13] Make dimensions and model type configurable --- tests/_self-adaptive/gen-costs.py | 41 ++++++++++++++----- .../regenerate-expected-outputs.sh | 9 ++++ 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/tests/_self-adaptive/gen-costs.py b/tests/_self-adaptive/gen-costs.py index 9a8ac77f..0605e544 100755 --- a/tests/_self-adaptive/gen-costs.py +++ b/tests/_self-adaptive/gen-costs.py @@ -15,13 +15,18 @@ class MarianTrainConfig: marian_dir: str model: str + model_type: str vocab1: str vocab2: str dim_vocab1: int
dim_vocab2: int + dim_emb: int epochs: int +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + def main(): parser = ag.ArgumentParser() parser.add_argument('-t', '--train-sets', @@ -29,7 +34,10 @@ def main(): parser.add_argument('-i', '--input', type=ag.FileType('r', encoding='utf-8'), required=True) parser.add_argument('-m', '--model', required=True) parser.add_argument('-v', '--vocabs', nargs=2, required=True) + parser.add_argument('--dim-vocabs', nargs=2, type=int, required=False) + parser.add_argument('--dim-emb', type=int, required=False) parser.add_argument('-e', '--epochs', type=int, required=True) + parser.add_argument('--type', required=True) parser.add_argument('--marian-dir', required=True) parser.add_argument('--output-costs', type=ag.FileType('w', encoding='utf-8')) parser.add_argument('--output-transl', type=ag.FileType('w', encoding='utf-8')) @@ -38,14 +46,14 @@ def main(): [sfile, tfile] = args.train_sets [vocab1, vocab2] = args.vocabs - config = MarianTrainConfig(args.marian_dir, args.model, vocab1, vocab2, 85000, 85000, args.epochs) + [dvocab1, dvocab2] = args.dim_vocabs if args.dim_vocabs is not None else [None, None] + config = MarianTrainConfig(args.marian_dir, args.model, args.type, vocab1, + vocab2, dvocab1, dvocab1, args.dim_emb, args.epochs) + eprint(config) costs_and_translations = iterate_over_inputs(config, sfile, tfile, args.input) output_costs_and_translations(costs_and_translations, args.output_costs, args.output_transl) -def eprint(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - def training_file_generator(source, target): begin_sentences = True contains_sentences = False @@ -117,9 +125,15 @@ def create_temp_model_copy(model): def train_marian(sfile, tfile, config): c = config - process = sp.run([f"{c.marian_dir}/marian", '-m', c.model, '--disp-freq', '1', '--optimizer', 'sgd', '--type', 'amun', '-v', - c.vocab1, '-v', c.vocab2, '--dim-vocabs', str(c.dim_vocab1), str(c.dim_vocab2), '--dim-emb', 
'500', - '--after-epochs', str(c.epochs), '--mini-batch', '1', '-t', sfile, tfile], capture_output=True, text=True) + + args = [f"{c.marian_dir}/marian", '-m', c.model, '--disp-freq', '1', '--optimizer', 'sgd', '--type', c.model_type, '-v', + c.vocab1, '-v', c.vocab2, '--after-epochs', str(c.epochs), '--mini-batch', '1', '-t', sfile, tfile] + if c.dim_emb is not None: + args += ['--dim-emb', str(c.dim_emb)] + if c.dim_vocab1 is not None and c.dim_vocab2 is not None: + args += ['--dim-vocabs', str(c.dim_vocab1), str(c.dim_vocab2)] + process = sp.run(args, capture_output=True, text=True) + eprint("STDOUT:") eprint(process.stdout) eprint("STDERR:") @@ -140,10 +154,15 @@ def extract_costs(output_log): def translate_marian(input_line, config): c = config - process = sp.run([f"{c.marian_dir}/marian-decoder", '-m', c.model, - '--type', 'amun', '-v', c.vocab1, '-v', c.vocab2, - '--dim-vocabs', str(c.dim_vocab1), str(c.dim_vocab2), - '--dim-emb', '500'], input=input_line, capture_output=True, text=True) + + args = [f"{c.marian_dir}/marian-decoder", '-m', c.model, + '--type', c.model_type, '-v', c.vocab1, '-v', c.vocab2] + if c.dim_emb is not None: + args += ['--dim-emb', str(c.dim_emb)] + if c.dim_vocab1 is not None and c.dim_vocab2 is not None: + args += ['--dim-vocabs', str(c.dim_vocab1), str(c.dim_vocab2)] + process = sp.run(args, input=input_line, capture_output=True, text=True) + eprint(f"Translate input: {input_line}") eprint("STDOUT:") eprint(process.stdout) diff --git a/tests/_self-adaptive/regenerate-expected-outputs.sh b/tests/_self-adaptive/regenerate-expected-outputs.sh index 4223490c..362a74de 100755 --- a/tests/_self-adaptive/regenerate-expected-outputs.sh +++ b/tests/_self-adaptive/regenerate-expected-outputs.sh @@ -10,6 +10,9 @@ echo "### Generating files for the oracle tests" ./gen-costs.py \ -t ubuntu.oracle_2s1e.{src,ref} \ -m $MODELS/model.npz \ + --type amun \ + --dim-vocabs 85000 85000 \ + --dim-emb 500 \ -v $MODELS/vocab.{en,de}.json \ -e 1 \ 
--marian-dir ~/prog/cpp/marian-adaptive/build/ \ @@ -24,6 +27,9 @@ echo -e "\n\n### Generating files for the partial context tests" ./gen-costs.py \ -t ubuntu.contextpart.{src,ref} \ -m $MODELS/model.npz \ + --type amun \ + --dim-vocabs 85000 85000 \ + --dim-emb 500 \ -v $MODELS/vocab.{en,de}.json \ -e 1 \ --marian-dir ~/prog/cpp/marian-adaptive/build/ \ @@ -36,6 +42,9 @@ echo -e "\n\n### Generating files for the no context tests" ./gen-costs.py \ -t ubuntu.nocontext.{src,ref} \ -m $MODELS/model.npz \ + --type amun \ + --dim-vocabs 85000 85000 \ + --dim-emb 500 \ -v $MODELS/vocab.{en,de}.json \ -e 1 \ --marian-dir ~/prog/cpp/marian-adaptive/build/ \ From a8bdf85b6d0edfcec56b6de85025e65db2d909ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 8 Nov 2021 16:22:24 +0200 Subject: [PATCH 11/13] Add a self-adaptive transformer test --- .../regenerate-expected-outputs.sh | 13 ++++++++++ .../test_context_partial_transformer.sh | 24 +++++++++++++++++++ .../transformer.contextpart.costs.expected | 12 ++++++++++ .../transformer.contextpart.expected | 11 +++++++++ 4 files changed, 60 insertions(+) create mode 100644 tests/_self-adaptive/test_context_partial_transformer.sh create mode 100644 tests/_self-adaptive/transformer.contextpart.costs.expected create mode 100644 tests/_self-adaptive/transformer.contextpart.expected diff --git a/tests/_self-adaptive/regenerate-expected-outputs.sh b/tests/_self-adaptive/regenerate-expected-outputs.sh index 362a74de..6b4b6dbc 100755 --- a/tests/_self-adaptive/regenerate-expected-outputs.sh +++ b/tests/_self-adaptive/regenerate-expected-outputs.sh @@ -50,3 +50,16 @@ echo -e "\n\n### Generating files for the no context tests" --marian-dir ~/prog/cpp/marian-adaptive/build/ \ -i ubuntu.src \ --output-transl nocontext.expected + + +echo -e "\n\n### Generating files for the transformer partial context tests" +./gen-costs.py \ + -t ubuntu.contextpart.{src,ref} \ + -m $MRT_MODELS/transformer/model.npz \ + --type 
transformer \ + -v $MRT_MODELS/transformer/vocab.ende.yml{,} \ + -e 1 \ + --marian-dir ~/prog/cpp/marian-adaptive/build/ \ + -i ubuntu.src \ + --output-costs transformer.contextpart.costs.expected \ + --output-transl transformer.contextpart.expected diff --git a/tests/_self-adaptive/test_context_partial_transformer.sh b/tests/_self-adaptive/test_context_partial_transformer.sh new file mode 100644 index 00000000..b3d76607 --- /dev/null +++ b/tests/_self-adaptive/test_context_partial_transformer.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Exit on error +set -e + +# Test code goes here +rm -f contextpart.log + +# Run Marian +$MRT_MARIAN/marian-adaptive \ + -m $MRT_MODELS/transformer/model.npz \ + -v $MRT_MODELS/transformer/vocab.ende.yml -v $MRT_MODELS/transformer/vocab.ende.yml \ + --after-epochs 1 \ + -t ubuntu.contextpart.src ubuntu.contextpart.ref --log contextpart.transformer.log < ubuntu.src > contextpart.transformer.out + +# Check outputs +$MRT_TOOLS/diff.sh contextpart.out contextpart.expected > contextpart.transformer.diff + +# Check costs +cat contextpart.log | $MRT_TOOLS/extract-costs.sh > contextpart.costs.transformer.out +$MRT_TOOLS/diff-nums.py -p 0.01 contextpart.costs.out contextpart.costs.expected -o contextpart.costs.transformer.diff + +# Exit with success code +exit 0 diff --git a/tests/_self-adaptive/transformer.contextpart.costs.expected b/tests/_self-adaptive/transformer.contextpart.costs.expected new file mode 100644 index 00000000..d2f8a912 --- /dev/null +++ b/tests/_self-adaptive/transformer.contextpart.costs.expected @@ -0,0 +1,12 @@ +5.41536570 +5.41065693 +4.29847670 +4.29496717 +4.33480740 +4.33143234 +0.43169200 +0.43012774 +2.70875025 +2.70587468 +3.08147693 +3.07468438 diff --git a/tests/_self-adaptive/transformer.contextpart.expected b/tests/_self-adaptive/transformer.contextpart.expected new file mode 100644 index 00000000..41aaf88a --- /dev/null +++ b/tests/_self-adaptive/transformer.contextpart.expected @@ -0,0 +1,11 @@ +Kli@@ cken und 
Japan@@ isch +drücken und halten Sie die lin@@ ke Maustaste auf ¥ . +Sie können die lin@@ ke Maustaste ge@@ drückt halten . +Sch@@ alter = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +warum Ihre E @-@ Mail oder Social Media Ac@@ counts zu Ihrem Desktop hinzufügen ? +warum sollte ich einen Account hinzufügen ? +entfernen Sie Software , die Sie nicht mehr verwenden . +überprüfen Sie Ihr Back@@ up +die von Ihnen verwendete Software kann normalerweise sehr schnell nach einem ernsten Computer@@ problem durch Neu@@ installation wieder@@ hergestellt werden . +sichern Sie Ihre wichtigen Dateien +was ist der chinesische Schlüssel ? From f95c51ce7aebbc438e0dbc6e2165c0822a840746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 16 Dec 2021 16:13:51 +0200 Subject: [PATCH 12/13] Fix self-adaptive server mode tests after changes in Marian --- tests/_self-adaptive/test_server_mode.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/_self-adaptive/test_server_mode.sh b/tests/_self-adaptive/test_server_mode.sh index 4a756619..68db8e37 100644 --- a/tests/_self-adaptive/test_server_mode.sh +++ b/tests/_self-adaptive/test_server_mode.sh @@ -9,7 +9,7 @@ clean_up() { trap clean_up EXIT # Test code goes here -$MRT_MARIAN/marian-adaptive -c $MRT_MODELS/wmt16_systems/marian.en-de.scorer.yml -p 8766 > server.log 2>&1 & +$MRT_MARIAN/marian-adaptive-server -c $MRT_MODELS/wmt16_systems/marian.en-de.scorer.yml -p 8766 > server.log 2>&1 & SERVER_PID=$! sleep 20 @@ -19,7 +19,7 @@ kill $SERVER_PID test -e server.log grep -q "listening on port 8766" server.log -grep -q '{"output":"das ist ein Beispiel' server.log +grep -q 'Best translation 0 : das ist ein Beispiel' server.log grep -q "Ep. 2 : Up. 4 : Sen. 2" server.log grep -q "Ep. 2 : Up. 2 : Sen. 
1" server.log grep -q "No context" server.log From bc7efdcc35d5940da6915cacd13a1ba51906c839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 1 Feb 2022 11:00:16 +0200 Subject: [PATCH 13/13] Regenerate expected outputs for self-adaptive tests with a known config CMake command: cmake .. -DCMAKE_BUILD_TYPE=Debug -DCOMPILE_ADAPTIVE=ON -DCOMPILE_SERVER=ON GCC version: 7.5.0 CUDA version: 10.1 GPU model: Quadro RTX 6000 (Turing) --- .../_self-adaptive/contextpart.costs.expected | 16 +++++----- tests/_self-adaptive/costs.expected | 30 +++++++++---------- .../transformer.contextpart.costs.expected | 18 +++++------ 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/_self-adaptive/contextpart.costs.expected b/tests/_self-adaptive/contextpart.costs.expected index 30a7cedb..8c33473d 100644 --- a/tests/_self-adaptive/contextpart.costs.expected +++ b/tests/_self-adaptive/contextpart.costs.expected @@ -1,12 +1,12 @@ 2.21881247 2.21806955 -0.46859190 -0.46825695 -2.76864076 -2.76791906 -0.56501639 +0.46859169 +0.46825710 +2.76864004 +2.76791930 +0.56501710 0.56451356 -2.71565461 -2.71488905 -3.11743832 +2.71565413 +2.71488881 +3.11743879 3.11609936 diff --git a/tests/_self-adaptive/costs.expected b/tests/_self-adaptive/costs.expected index 326568ae..ac09ceab 100644 --- a/tests/_self-adaptive/costs.expected +++ b/tests/_self-adaptive/costs.expected @@ -1,22 +1,22 @@ -0.31558657 -0.31486344 +0.31558633 +0.31486320 2.21881247 2.21806955 -0.46859190 -0.46825695 +0.46859169 +0.46825710 4.07114267 -4.07016516 -2.76864076 -2.76791906 -0.56501639 +4.07016468 +2.76864004 +2.76791930 +0.56501710 0.56451356 -2.71565461 -2.71488905 -1.87033248 -1.86898255 +2.71565413 +2.71488881 +1.87033439 +1.86898220 1.77039230 1.76994097 -2.79638195 -2.79453039 -3.11743832 +2.79638267 +2.79453158 +3.11743879 3.11609936 diff --git a/tests/_self-adaptive/transformer.contextpart.costs.expected b/tests/_self-adaptive/transformer.contextpart.costs.expected index 
d2f8a912..eb54d76d 100644 --- a/tests/_self-adaptive/transformer.contextpart.costs.expected +++ b/tests/_self-adaptive/transformer.contextpart.costs.expected @@ -1,12 +1,12 @@ 5.41536570 -5.41065693 +5.41065788 4.29847670 4.29496717 -4.33480740 -4.33143234 -0.43169200 -0.43012774 -2.70875025 -2.70587468 -3.08147693 -3.07468438 +4.33481550 +4.33142281 +0.43169218 +0.43012768 +2.70875049 +2.70587540 +3.08147740 +3.07468462