Skip to content

Commit

Permalink
Add MT tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit committed Oct 16, 2024
1 parent b7c06c8 commit 751a410
Show file tree
Hide file tree
Showing 12 changed files with 1,586 additions and 80 deletions.
17 changes: 4 additions & 13 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,24 +1,15 @@
{
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.organizeImports": "explicit"
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.analysis.extraPaths": [
"tests"
],
"python.analysis.extraPaths": ["tests"],
"python.analysis.importFormat": "relative",
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
"black-formatter.path": [
"poetry",
"run",
"black"
],
"python.analysis.extraPaths": [
"./tests"
]
}
"black-formatter.path": ["poetry", "run", "black"]
}
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ If you would like to find out more about how to use Machine, check out the tutor
- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
- [Machine Translation](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/machine_translation.ipynb)
14 changes: 13 additions & 1 deletion machine/jobs/huggingface/hugging_face_nmt_model_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from pathlib import Path
from typing import Any, cast

import datasets.utils.logging as datasets_logging
import transformers.utils.logging as transformers_logging
from transformers import AutoConfig, AutoModelForSeq2SeqLM, HfArgumentParser, PreTrainedModel, Seq2SeqTrainingArguments
from transformers.integrations import ClearMLCallback
from transformers.tokenization_utils import TruncationStrategy
Expand Down Expand Up @@ -39,6 +41,16 @@ def __init__(self, config: Any) -> None:
):
self._training_args.report_to.remove("clearml")

# training_args.log_level defaults to "passive", so we set the verbosity to info here to make info the effective default.
transformers_logging.set_verbosity_info()

log_level = self._training_args.get_process_log_level()
logger.setLevel(log_level)
datasets_logging.set_verbosity(log_level)
transformers_logging.set_verbosity(log_level)
transformers_logging.enable_default_handler()
transformers_logging.enable_explicit_format()

@property
def train_tokenizer(self) -> bool:
return False
Expand Down Expand Up @@ -67,7 +79,7 @@ def create_model_trainer(self, corpus: ParallelTextCorpus) -> Trainer:
src_lang=self._config.src_lang,
tgt_lang=self._config.trg_lang,
add_unk_src_tokens=self._config.huggingface.tokenizer.add_unk_src_tokens,
add_unk_trg_tokens=self._config.huggingface.tokenizer.add_unk_trg_tokens,
add_unk_tgt_tokens=self._config.huggingface.tokenizer.add_unk_tgt_tokens,
)

def create_engine(self) -> TranslationEngine:
Expand Down
2 changes: 1 addition & 1 deletion machine/jobs/settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ default:
oom_batch_size_backoff_mult: 0.5
tokenizer:
add_unk_src_tokens: true
add_unk_trg_tokens: true
add_unk_tgt_tokens: true
thot_mt:
word_alignment_model_type: hmm
tokenizer: latin
Expand Down
2 changes: 1 addition & 1 deletion machine/translation/huggingface/hugging_face_nmt_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def __init__(
self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
else:
self._mpn = None

Expand Down
47 changes: 17 additions & 30 deletions machine/translation/huggingface/hugging_face_nmt_model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
from pathlib import Path
from typing import Any, Callable, List, Optional, Union, cast

import datasets.utils.logging as datasets_logging
import torch # pyright: ignore[reportMissingImports]
import transformers.utils.logging as transformers_logging
from datasets.arrow_dataset import Dataset
from sacremoses import MosesPunctNormalizer
from torch import Tensor # pyright: ignore[reportMissingImports]
Expand Down Expand Up @@ -84,10 +82,10 @@ def __init__(
corpus: Union[ParallelTextCorpus, Dataset],
src_lang: Optional[str] = None,
tgt_lang: Optional[str] = None,
max_source_length: Optional[int] = None,
max_target_length: Optional[int] = None,
max_src_length: Optional[int] = None,
max_tgt_length: Optional[int] = None,
add_unk_src_tokens: bool = False,
add_unk_trg_tokens: bool = True,
add_unk_tgt_tokens: bool = True,
) -> None:
self._model = model
self._training_args = training_args
Expand All @@ -96,12 +94,12 @@ def __init__(
self._tgt_lang = tgt_lang
self._trainer: Optional[Seq2SeqTrainer] = None
self._metrics = {}
self.max_source_length = max_source_length
self.max_target_length = max_target_length
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self._add_unk_src_tokens = add_unk_src_tokens
self._add_unk_trg_tokens = add_unk_trg_tokens
self._add_unk_tgt_tokens = add_unk_tgt_tokens
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
self._stats = TrainStats()

@property
Expand All @@ -113,17 +111,6 @@ def train(
progress: Optional[Callable[[ProgressStatus], None]] = None,
check_canceled: Optional[Callable[[], None]] = None,
) -> None:
if self._training_args.should_log:
# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers_logging.set_verbosity_info()

log_level = self._training_args.get_process_log_level()
logger.setLevel(log_level)
datasets_logging.set_verbosity(log_level)
transformers_logging.set_verbosity(log_level)
transformers_logging.enable_default_handler()
transformers_logging.enable_explicit_format()

last_checkpoint = None
if os.path.isdir(self._training_args.output_dir) and not self._training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(self._training_args.output_dir)
Expand Down Expand Up @@ -203,7 +190,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
logger.info(f"Added {len(missing_tokens)} tokens to the tokenizer: {missing_tokens}")
return AutoTokenizer.from_pretrained(str(tokenizer_dir), use_fast=True)

if self._add_unk_src_tokens or self._add_unk_trg_tokens:
if self._add_unk_src_tokens or self._add_unk_tgt_tokens:
logger.info("Checking for missing tokens")
if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
Expand All @@ -217,7 +204,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
)
# using unofficially supported behavior to set the normalizer
tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer # type: ignore
if self._add_unk_src_tokens and self._add_unk_trg_tokens:
if self._add_unk_src_tokens and self._add_unk_tgt_tokens:
lang_codes = [src_lang, tgt_lang]
elif self._add_unk_src_tokens:
lang_codes = [src_lang]
Expand Down Expand Up @@ -293,12 +280,12 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
if model.name_or_path.startswith("t5-") or model.name_or_path.startswith("google/mt5-"):
prefix = f"translate {self._src_lang} to {self._tgt_lang}: "

max_source_length = self.max_source_length
if max_source_length is None:
max_source_length = model.config.max_length
max_target_length = self.max_target_length
if max_target_length is None:
max_target_length = model.config.max_length
max_src_length = self.max_src_length
if max_src_length is None:
max_src_length = model.config.max_length
max_tgt_length = self.max_tgt_length
if max_tgt_length is None:
max_tgt_length = model.config.max_length

if self._training_args.label_smoothing_factor > 0 and not hasattr(
model, "prepare_decoder_input_ids_from_labels"
Expand All @@ -317,9 +304,9 @@ def preprocess_function(examples):
inputs = [prefix + ex[src_lang] for ex in examples["translation"]]
targets = [ex[tgt_lang] for ex in examples["translation"]]

model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True)
# Tokenize targets with the `text_target` keyword argument
labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
labels = tokenizer(text_target=targets, max_length=max_tgt_length, truncation=True)

model_inputs["labels"] = labels["input_ids"]
return model_inputs
Expand Down
14 changes: 13 additions & 1 deletion machine/translation/translation_suggester.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from abc import ABC, abstractmethod
from typing import Iterable, Sequence
from typing import Iterable, Optional, Sequence

from .interactive_translator import InteractiveTranslator
from .translation_result import TranslationResult
from .translation_suggestion import TranslationSuggestion
from .truecaser import Truecaser


class TranslationSuggester(ABC):
Expand All @@ -14,3 +16,13 @@ def __init__(self, confidence_threshold: float = 0, break_on_punctuation: bool =
def get_suggestions(
self, n: int, prefix_count: int, is_last_word_complete: bool, results: Iterable[TranslationResult]
) -> Sequence[TranslationSuggestion]: ...

def get_suggestions_from_translator(
    self, n: int, translator: InteractiveTranslator, truecaser: Optional[Truecaser] = None
) -> Sequence[TranslationSuggestion]:
    """Build suggestions from the current state of an interactive translator.

    The translator's current results are fetched and, when a truecaser is
    supplied, each result is lazily passed through
    ``truecaser.truecase_translation_result`` together with the translator's
    target detokenizer. The (possibly truecased) results are then handed to
    ``get_suggestions`` along with the number of prefix word ranges and the
    translator's ``is_last_word_complete`` flag.

    Args:
        n: Forwarded unchanged as the first argument of ``get_suggestions``.
        translator: Source of the current results and prefix state.
        truecaser: Optional truecaser applied to every result.

    Returns:
        Whatever ``get_suggestions`` returns for the prepared arguments.
    """
    current_results = translator.get_current_results()
    if truecaser is None:
        candidates = current_results
    else:
        # Kept lazy on purpose: results are truecased only as they are consumed.
        candidates = map(
            lambda res: truecaser.truecase_translation_result(res, translator.target_detokenizer),
            current_results,
        )
    prefix_count = len(translator.prefix_word_ranges)
    return self.get_suggestions(n, prefix_count, translator.is_last_word_complete, candidates)
21 changes: 20 additions & 1 deletion machine/translation/truecaser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from abc import ABC, abstractmethod
from typing import Sequence
from typing import Optional, Sequence

from ..corpora.text_corpus import TextCorpus
from ..tokenization.detokenizer import Detokenizer
from ..tokenization.whitespace_detokenizer import WHITESPACE_DETOKENIZER
from .trainer import Trainer
from .translation_result import TranslationResult


class Truecaser(ABC):
Expand All @@ -15,5 +18,21 @@ def train_segment(self, segment: Sequence[str], sentence_start: bool = True) ->
@abstractmethod
def truecase(self, segment: Sequence[str]) -> Sequence[str]: ...

def truecase_translation_result(
    self, result: TranslationResult, detokenizer: Optional[Detokenizer] = None
) -> TranslationResult:
    """Return a copy of ``result`` whose target side has been truecased.

    The target tokens are run through ``self.truecase`` and the translation
    text is rebuilt by detokenizing the truecased tokens. All other fields
    (source tokens, confidences, sources, alignment, phrases) are carried
    over from ``result`` unchanged.

    Args:
        result: The translation result to truecase.
        detokenizer: Detokenizer used to rebuild the translation text;
            ``WHITESPACE_DETOKENIZER`` is used when omitted.

    Returns:
        A new ``TranslationResult`` with truecased target tokens and text.
    """
    effective_detokenizer = WHITESPACE_DETOKENIZER if detokenizer is None else detokenizer
    cased_tokens = self.truecase(result.target_tokens)
    translation = effective_detokenizer.detokenize(cased_tokens)
    return TranslationResult(
        translation,
        result.source_tokens,
        cased_tokens,
        result.confidences,
        result.sources,
        result.alignment,
        result.phrases,
    )

@abstractmethod
def save(self) -> None: ...
29 changes: 29 additions & 0 deletions samples/data/smt.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Translation model prefix
-tm tm/src_trg

# Language model
-lm lm/trg.lm

# W parameter (maximum number of translation options to be considered for each source phrase)
-W 10

# S parameter (maximum number of hypotheses that can be stored in each stack)
-S 10

# A parameter (maximum length in words of the source phrases to be translated)
-A 7

# Degree of non-monotonicity
-nomon 0

# Heuristic function used
-h 6

# Best-first search flag
-be

# Translation model weights
-tmw 0 0.5 1 1 1 1 0 1

# Set online learning parameters (ol_alg, lr_policy, l_stepsize, em_iters, e_par, r_par)
-olp 0 0 1 5 1 0
Loading

0 comments on commit 751a410

Please sign in to comment.