Switch to ICU tokenizer #939

Open · wants to merge 6 commits into base: main
12 changes: 7 additions & 5 deletions docs/opus-trainer.md
@@ -92,9 +92,10 @@ modifiers:
max_word_length: 5 # Maximum word length for each word in the noisy sentence
max_words: 6 # Maximum number of words in each noisy sentence
- Tags: 0.05
custom_detok_src: {src}
custom_detok_trg: {trg}
custom_detok_src: "icu:{src}"
custom_detok_trg: "icu:{trg}"
augment: 1
tag: 0
spm_vocab: {vocab}
seed: 1111

@@ -104,13 +105,14 @@ num_fields: 3

#### Tokenization and alignments

`Tags` modifiers requires whitespace or Moses tokenized alignments as input.
The `Tags` modifier requires whitespace-, Moses- or ICU-tokenized alignments as input.
Marian requires Sentencepiece tokenized alignments and raw text input.
To make them compatible `Tags` modifier can remap the alignments in the end using the passed Sentencepiece model `spm_vocab: vocab.spm` (student model use case).
If the `spm_vocab` argument is missing `Tags` modifier will remove alignments and output only the parallel sentences (teacher model use case).

Currently, Moses-tokenized text and its alignments are passed to OpusTrainer (to work around CJK languages where whitespace-based tokenization doesn't make sense).
`custom_detok_{src,trg}` OpusTrainer modifiers are applied to detokenize text after inline noise is added.
Currently, ICU-tokenized text and its alignments are passed to OpusTrainer (to work around CJK languages where whitespace-based tokenization doesn't make sense).
Whitespace is represented with a special symbol "▁" to allow lossless text reconstruction on the OpusTrainer side.
`custom_detok_icu:{src,trg}` OpusTrainer modifiers are applied to detokenize text after inline noise is added.
Then the detokenized text is passed to Marian together with the alignments remapped to SentencePiece tokenization.
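For illustration, here is a minimal sketch of this ICU round trip (assuming PyICU is installed; it mirrors the `IcuTokenizer` added in `pipeline/alignments/tokenizer.py` in this PR and is not part of the training pipeline itself):

```python
from icu import BreakIterator, Locale

SPACE_TOKEN = "▁"  # same symbol SentencePiece uses for whitespace

def icu_tokenize(text: str, lang: str) -> list[str]:
    # ICU word segmentation; spaces are kept as tokens and replaced with "▁"
    bi = BreakIterator.createWordInstance(Locale(lang))
    bi.setText(text)
    tokens, start = [], bi.first()
    for end in bi:
        piece = text[start:end]
        if piece and piece != "\n":  # drop empty tokens and newlines, keep spaces
            tokens.append(piece.replace(" ", SPACE_TOKEN))
        start = end
    return tokens

def icu_detokenize(tokens: list[str]) -> str:
    # Lossless reconstruction: concatenate and map "▁" back to spaces
    return "".join(tokens).replace(SPACE_TOKEN, " ")

print(icu_tokenize("Hello world!", "en"))                  # ['Hello', '▁', 'world', '!']
print(icu_detokenize(icu_tokenize("Hello world!", "en")))  # Hello world!
```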

## Models
43 changes: 24 additions & 19 deletions pipeline/alignments/align.py
@@ -32,7 +32,7 @@
import zstandard
from tqdm import tqdm

from pipeline.alignments.tokenizer import tokenize_moses
from pipeline.alignments.tokenizer import tokenize, TokenizerType
from pipeline.common.logging import get_logger

logger = get_logger("alignments")
@@ -41,6 +41,7 @@
class Tokenization(Enum):
spaces = "spaces"
moses = "moses"
icu = "icu"


def run(
@@ -63,25 +64,29 @@ def run(
corpus_src = decompress(corpus_src)
corpus_trg = decompress(corpus_trg)

if tokenization == Tokenization.moses:
if tokenization == Tokenization.spaces:
tokenized_src, tokenized_trg = corpus_src, corpus_trg
output_aln = output_path
else:
ext = f".tok-{tokenization.value}"
tokenized_src = (
corpus_src[: corpus_src.rfind(".")]
+ ".tok-moses"
+ corpus_src[corpus_src.rfind(".") :]
corpus_src[: corpus_src.rfind(".")] + ext + corpus_src[corpus_src.rfind(".") :]
)
tokenized_trg = (
corpus_trg[: corpus_trg.rfind(".")]
+ ".tok-moses"
+ corpus_trg[corpus_trg.rfind(".") :]
corpus_trg[: corpus_trg.rfind(".")] + ext + corpus_trg[corpus_trg.rfind(".") :]
)
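# e.g. with ext == ".tok-icu", "corpus.en" becomes "corpus.tok-icu.en" (illustrative example)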
output_aln = os.path.join(tmp_dir, "aln")

if tokenization == Tokenization.moses:
tokenizer = TokenizerType.fast_moses
elif tokenization == Tokenization.icu:
tokenizer = TokenizerType.icu
else:
raise ValueError(f"Unrecognized tokenization type {tokenization}")
# C++ tokenizer can process 100k sentences per second on a single core,
# so the chunks to parallelize things should be large enough to increase throughput
tokenize_moses(corpus_src, tokenized_src, src, sentences_per_chunk=500000)
tokenize_moses(corpus_trg, tokenized_trg, trg, sentences_per_chunk=500000)
else:
tokenized_src, tokenized_trg = corpus_src, corpus_trg
output_aln = output_path
tokenize(corpus_src, tokenized_src, src, sentences_per_chunk=500000, tokenizer=tokenizer)
tokenize(corpus_trg, tokenized_trg, trg, sentences_per_chunk=500000, tokenizer=tokenizer)

fwd_path, rev_path = align(
corpus_src=tokenized_src,
@@ -101,7 +106,7 @@ def run(
priors_output_path=priors_output_path,
)

if tokenization == Tokenization.moses:
if tokenization != Tokenization.spaces:
if output_tokenized:
logger.info("Saving tokenized corpus")
# Copy tokenized corpus to output directory
@@ -261,12 +266,12 @@ def remap(
output_aln_path: str,
) -> None:
"""
Remaps alignments that were calculated for Moses-tokenized corpus to whitespace-tokenized ones.
Remaps alignments that were calculated for tokenized corpus to whitespace-tokenized ones.
:param src_path: path to whitespace-tokenized sentences in source language
:param trg_path: path to whitespace-tokenized sentences in target language
:param tok_src_path: path to Moses-tokenized sentences in source language
:param tok_trg_path: path to Moses-tokenized sentences in target language
:param aln_path: path to the alignments calculated for Moses-tokenized corpus
:param tok_src_path: path to tokenized sentences in source language
:param tok_trg_path: path to tokenized sentences in target language
:param aln_path: path to the alignments calculated for tokenized corpus
:param output_aln_path: path to output alignments file remapped to whitespace-tokenized corpus
"""
logger.info("Remapping alignments to whitespace tokenization")
@@ -390,7 +395,7 @@ def main() -> None:
choices=list(Tokenization),
default=Tokenization.spaces,
help="Use the specified tokenization method. Default is `spaces` which means no tokenization will be applied. "
"It remaps the alignments back to whitespace tokenized ones if the `moses` tokenization is used.",
"It remaps the alignments back to whitespace tokenized ones if another tokenization method is used.",
)
parser.add_argument(
"--output_tokenized",
1 change: 1 addition & 0 deletions pipeline/alignments/requirements/alignments.in
@@ -3,3 +3,4 @@ opus-fast-mosestokenizer==0.0.8.5
tqdm
requests==2.31.0
zstandard
PyICU==2.8.1
2 changes: 2 additions & 0 deletions pipeline/alignments/requirements/alignments.txt
@@ -18,6 +18,8 @@ numpy==1.26.4
# via eflomal
opus-fast-mosestokenizer==0.0.8.5
# via -r pipeline/alignments/requirements/alignments.in
pyicu==2.8.1
# via -r pipeline/alignments/requirements/alignments.in
requests==2.31.0
# via -r pipeline/alignments/requirements/alignments.in
tqdm==4.66.4
150 changes: 133 additions & 17 deletions pipeline/alignments/tokenizer.py
@@ -4,15 +4,22 @@

Example:
python pipeline/alignments/tokenizer.py --input_path=data/datasets/news.2023.en.shuffled.deduped \
--output_path=data/datasets/news.2023.en.shuffled.deduped.tok-moses --lang=en --chunk_size=500000
--output_path=data/datasets/news.2023.en.shuffled.deduped.tok-icu --lang=en --chunk_size=500000 --tokenizer=icu

Using C++ opus-fast-mosestokenizer sometimes requires specifying LD_LIBRARY_PATH before starting the Python process
see https://github.com/Helsinki-NLP/opus-fast-mosestokenizer/issues/6
export LD_LIBRARY_PATH=.../<your-python-env>/lib/python3.10/site-packages/mosestokenizer/lib

Using the ICU tokenizer requires installing it with `apt-get install python3-icu`;
see more installation instructions here: https://pypi.org/project/PyICU/

Whitespace is ignored by the Moses-based tokenizers, while the ICU tokenizer preserves it by replacing it with a special token "▁",
which allows lossless reconstruction of the original text on detokenization.
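For example (illustrative): the ICU tokenizer splits "Hello world!" into ["Hello", "▁", "world", "!"],
and detokenization concatenates the tokens and maps "▁" back to a regular space, reproducing the original string.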

"""
import argparse
import multiprocessing
from enum import Enum
from typing import List

from tqdm import tqdm
@@ -22,6 +29,99 @@
logger = get_logger("tokenizer")


class TokenizerType(Enum):
fast_moses = "fast_moses"
sacre_moses = "sacre_moses"
icu = "icu"


class Tokenizer:
def __init__(self, lang: str):
self.lang = lang

def tokenize(self, text: str) -> List[str]:
pass

def detokenize(self, tokens: List[str]) -> str:
pass


class FastMosesTokenizer(Tokenizer):
"""
Uses Moses tokenizer https://github.com/Helsinki-NLP/opus-fast-mosestokenizer
"""

def __init__(self, lang):
super().__init__(lang)
from mosestokenizer import MosesTokenizer

try:
self.tokenizer = MosesTokenizer(lang)
except RuntimeError as err:
msg = str(err)
if "No known abbreviations for language" in msg:
# Fall-back to English if the language is not found
self.tokenizer = MosesTokenizer("en")
else:
raise err

def tokenize(self, text: str) -> List[str]:
return self.tokenizer.tokenize(text)

def detokenize(self, tokens: List[str]) -> str:
return self.tokenizer.detokenize(tokens)


class SacreMosesTokenizer(Tokenizer):
"""
Uses Moses tokenizer https://github.com/hplt-project/sacremoses
"""

def __init__(self, lang):
super().__init__(lang)
import sacremoses

self.tokenizer = sacremoses.MosesTokenizer(lang)
self.detokenizer = sacremoses.MosesDetokenizer(lang)

def tokenize(self, text: str) -> List[str]:
return self.tokenizer.tokenize(text)

def detokenize(self, tokens: List[str]) -> str:
return self.detokenizer.detokenize(tokens)


class IcuTokenizer(Tokenizer):
"""
Uses ICU based word segmenter https://pypi.org/project/PyICU/
Preserves whitespaces as tokens by replacing them with a special character "▁".
Allows lossless reconstruction of the original text on detokenization.
"""

# Same character is used by SentencePiece
SPACE_TOKEN = "▁"

def tokenize(self, text: str) -> List[str]:
from icu import BreakIterator, Locale

bi = BreakIterator.createWordInstance(Locale(self.lang))
bi.setText(text)

tokens = []
start = bi.first()
for end in bi:
token = text[start:end]
if (
token and token != "\n"
): # exclude empty tokens, but leave whitespaces and replace them with a special token
tokens.append(token.replace(" ", self.SPACE_TOKEN))
start = end
return tokens

def detokenize(self, tokens: List[str]) -> str:
return "".join(tokens).replace(self.SPACE_TOKEN, " ")


def _read_file_in_chunks(file_path, chunk_size):
with open(file_path, "r", encoding="utf-8") as file:
while True:
@@ -32,18 +132,16 @@ def _read_file_in_chunks(file_path, chunk_size):


def _tokenize_lines(params) -> List[str]:
lines, lang = params
from mosestokenizer import MosesTokenizer

try:
tokenizer = MosesTokenizer(lang)
except RuntimeError as err:
msg = str(err)
if "No known abbreviations for language" in msg:
# Fall-back to English if the language is not found
tokenizer = MosesTokenizer("en")
else:
raise err
lines, lang, tok_type = params

if tok_type == TokenizerType.fast_moses:
tokenizer = FastMosesTokenizer(lang)
elif tok_type == TokenizerType.sacre_moses:
tokenizer = SacreMosesTokenizer(lang)
elif tok_type == TokenizerType.icu:
tokenizer = IcuTokenizer(lang)
else:
raise ValueError(f"Unknown tokenizer type: {tok_type}")

tokenized = []
for line in lines:
@@ -52,8 +150,12 @@ def _tokenize_lines(params) -> List[str]:
return tokenized


def tokenize_moses(
input_path: str, output_path: str, lang: str, sentences_per_chunk: int = 100000
def tokenize(
input_path: str,
output_path: str,
lang: str,
tokenizer: TokenizerType,
sentences_per_chunk: int = 100000,
) -> None:
logger.info(f"Tokenizing {input_path} with Moses tokenizer")

@@ -65,7 +167,7 @@ def tokenize_moses(
# ~100K sentences per second on a single core
for tokenized_chunk in pool.imap(
_tokenize_lines,
((ch, lang) for ch in chunks),
((ch, lang, tokenizer) for ch in chunks),
):
output_file.write("\n".join(tokenized_chunk) + "\n")
pbar.update(len(tokenized_chunk))
@@ -104,5 +206,19 @@ def tokenize_moses(
default=None,
help="Number of lines to process per chunk",
)
parser.add_argument(
"--tokenizer",
metavar="TOKENIZER",
type=TokenizerType,
choices=TokenizerType,
default=TokenizerType.icu,
help="Tokenization method",
)
args = parser.parse_args()
tokenize_moses(args.input_path, args.output_path, args.lang, args.chunk_size)
tokenize(
input_path=args.input_path,
output_path=args.output_path,
lang=args.lang,
sentences_per_chunk=args.chunk_size,
tokenizer=args.tokenizer,
)
4 changes: 2 additions & 2 deletions pipeline/data/requirements/data.in
@@ -1,5 +1,5 @@
# use the latest main, switch to PyPi when released
git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
# ICU tokenizer commit
git+https://github.com/mozilla/OpusTrainer.git@ee534f34e2267c751f4686d7bae27673564c547b
simalign==0.4
mtdata==0.4.1
psutil==6.0.0
2 changes: 1 addition & 1 deletion pipeline/data/requirements/data.txt
@@ -62,7 +62,7 @@ numpy==1.26.4
# transformers
opencc==1.1.9
# via -r pipeline/data/requirements/data.in
opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
opustrainer @ git+https://github.com/mozilla/OpusTrainer.git@ee534f34e2267c751f4686d7bae27673564c547b
# via -r pipeline/data/requirements/data.in
packaging==24.1
# via
5 changes: 3 additions & 2 deletions pipeline/train/configs/opustrainer/student.cjk.yml
@@ -21,8 +21,9 @@ modifiers:
# Tags modifier has to be the last one to retokenize the alignments
- Tags: 0.005
augment: 1
custom_detok_src: {src}
custom_detok_trg: {trg}
tag: 0
custom_detok_src: "icu:{src}"
custom_detok_trg: "icu:{trg}"
spm_vocab: {vocab}

seed: 1111
5 changes: 3 additions & 2 deletions pipeline/train/configs/opustrainer/student.yml
@@ -26,8 +26,9 @@ modifiers:
# Tags modifier has to be the last one to retokenize the alignments
- Tags: 0.005
augment: 1
custom_detok_src: {src}
custom_detok_trg: {trg}
tag: 0
custom_detok_src: "icu:{src}"
custom_detok_trg: "icu:{trg}"
spm_vocab: {vocab}

seed: 1111
5 changes: 3 additions & 2 deletions pipeline/train/configs/opustrainer/teacher.one-stage.yml
@@ -30,9 +30,10 @@ modifiers:
# we don't use alignments for teacher training
# Tags modifier has to be the last one to remove the alignments
- Tags: 0.005
custom_detok_src: {src}
custom_detok_trg: {trg}
custom_detok_src: "icu:{src}"
custom_detok_trg: "icu:{trg}"
augment: 1
tag: 0


# random seed should be different for different teacher models