French g2p with pronunciation dictionary (#7601)
* enable prondict g2p for fr
* add processing for contractions
* update ipa lexicon
* debug and add tests
* fix alphabet casing
* fix tokenizer utils tests
* add ipa tokenizer test for fr

Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>

---------

Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>
Signed-off-by: Mariana <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Xuesong Yang <[email protected]>
3 people authored Oct 20, 2023
1 parent d1b0162 commit 5895a57
Showing 6 changed files with 172 additions and 3 deletions.
32 changes: 30 additions & 2 deletions nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -15,7 +15,7 @@

# fmt: off

-SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT"]
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"]

DEFAULT_PUNCTUATION = (
',', '.', '!', '?', '-',
@@ -48,6 +48,13 @@
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ',
),
+"fr-FR": (
+'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
+'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
+'U', 'V', 'W', 'X', 'Y', 'Z', 'À', 'Â', 'Ä', 'Æ',
+'Ç', 'È', 'É', 'Ê', 'Ë', 'Í', 'Î', 'Ï', 'Ñ', 'Ô',
+'Ö', 'Ù', 'Û', 'Ü', 'Ō', 'Œ',
+),
"it-IT": (
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
@@ -77,6 +84,13 @@
'ɒ', 'ɔ', 'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ',
'ʊ', 'ʌ', 'ʒ', '̃', 'θ'
),
+"fr-FR": (
+'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
+'m', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
+'y', 'z', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɒ', 'ɔ',
+'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɲ', 'ɹ', 'ʁ', 'ʃ', 'ʊ',
+'ʌ', 'ʒ', 'θ', 'ː', '̃'
+),
"it-IT": (
'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
@@ -143,7 +157,7 @@ def get_ipa_punctuation_list(locale):
punct_set = set(DEFAULT_PUNCTUATION)
# TODO @xueyang: verify potential mismatches with locale-specific punctuation sets used
# in nemo_text_processing.text_normalization.en.taggers.punctuation.py
-if locale in ["de-DE", "es-ES", "it-IT"]:
+if locale in ["de-DE", "es-ES", "it-IT", "fr-FR"]:
# ref: https://en.wikipedia.org/wiki/Guillemet#Uses
punct_set.update(['«', '»', '‹', '›'])
if locale == "de-DE":
@@ -190,6 +204,20 @@ def get_ipa_punctuation_list(locale):
elif locale == "es-ES":
# ref: https://en.wikipedia.org/wiki/Spanish_orthography#Punctuation
punct_set.update(['¿', '¡'])
+elif locale == "fr-FR":
+punct_set.update(
+[
+'–', # en dash, U+2013, decimal 8211
+'“', # left double quotation mark, U+201C, decimal 8220
+'”', # right double quotation mark, U+201D, decimal 8221
+'…', # horizontal ellipsis, U+2026, decimal 8230
+'̀', # combining grave accent, U+0300, decimal 768
+'́', # combining acute accent, U+0301, decimal 769
+'̂', # combining circumflex accent, U+0302, decimal 770
+'̈', # combining diaeresis, U+0308, decimal 776
+'̧', # combining cedilla, U+0327, decimal 807
+]
+)

punct_list = sorted(list(punct_set))
return punct_list
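
For orientation, here is a minimal sketch of how the new "fr-FR" lexicon entries are consumed through this module's helpers. Both helper names appear in this commit; the membership checks in the asserts are assumptions about the returned collections, not verified output.

from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
    get_grapheme_character_set,
    get_ipa_punctuation_list,
)

# Grapheme set, lowercased the same way FrenchCharsTokenizer requests it below.
fr_graphemes = get_grapheme_character_set(locale="fr-FR", case="lower")
assert 'é' in fr_graphemes and 'œ' in fr_graphemes

# Punctuation: the defaults, plus guillemets, plus the French-specific marks
# added above (en dash, curly quotes, ellipsis, combining accents).
fr_punct = get_ipa_punctuation_list("fr-FR")
assert '«' in fr_punct and '…' in fr_punct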
nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -19,6 +19,7 @@
from typing import List, Tuple

__all__ = [
+"french_text_preprocessing",
"chinese_text_preprocessing",
"english_text_preprocessing",
"any_locale_text_preprocessing",
@@ -196,3 +197,7 @@ def italian_text_preprocessing(text: str) -> str:

def chinese_text_preprocessing(text: str) -> str:
return text
+
+
+def french_text_preprocessing(text: str) -> str:
+return text.lower()
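
The French hook only lowercases. This matters because english_text_preprocessing additionally folds accented characters toward ASCII, which would destroy French diacritics, while Python's str.lower() already maps accented capitals correctly:

assert french_text_preprocessing("Déjà Vu À Paris") == "déjà vu à paris"
assert french_text_preprocessing("ŒUF") == "œuf"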
nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -28,6 +28,7 @@
any_locale_text_preprocessing,
chinese_text_preprocessing,
english_text_preprocessing,
+french_text_preprocessing,
italian_text_preprocessing,
spanish_text_preprocessing,
)
@@ -268,6 +269,35 @@ def __init__(
)


+class FrenchCharsTokenizer(BaseCharsTokenizer):
+
+PUNCT_LIST = get_ipa_punctuation_list("fr-FR")
+
+def __init__(
+self, punct=True, apostrophe=True, add_blank_at=None, pad_with_space=False, non_default_punct_list=None,
+):
+"""French grapheme tokenizer.
+Args:
+punct: Whether to reserve grapheme for basic punctuation or not.
+apostrophe: Whether to use apostrophe or not.
+add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
+if None then no blank in labels.
+pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
+non_default_punct_list: List of punctuation marks which will be used instead of the default ones.
+"""
+
+fr_alphabet = get_grapheme_character_set(locale="fr-FR", case="lower")
+super().__init__(
+chars=fr_alphabet,
+punct=punct,
+apostrophe=apostrophe,
+add_blank_at=add_blank_at,
+pad_with_space=pad_with_space,
+non_default_punct_list=non_default_punct_list,
+text_preprocessing_func=french_text_preprocessing,
+)


class ItalianCharsTokenizer(BaseCharsTokenizer):
PUNCT_LIST = get_ipa_punctuation_list("it-IT")

@@ -619,7 +649,7 @@ def __init__(
Args:
g2p: Grapheme to phoneme module, should be IpaG2p or some subclass thereof.
locale: Locale used to determine default text processing logic and punctuation.
-Supports ["en-US", "de-DE", "es-ES"]. Defaults to "en-US".
+Supports ["en-US", "de-DE", "es-ES", "fr-FR"]. Defaults to "en-US".
Specify None if implementing custom logic for a new locale.
punct: Whether to reserve grapheme for basic punctuation or not.
non_default_punct_list: List of punctuation marks which will be used instead default, if any.
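
A brief usage sketch of the new grapheme tokenizer. The encode/decode method names are assumed from the BaseTokenizer interface exercised by the unit tests further down; the exact token ids depend on the vocabulary ordering.

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import FrenchCharsTokenizer

tokenizer = FrenchCharsTokenizer()
# french_text_preprocessing lowercases the input before tokenization.
token_ids = tokenizer.encode("Bon après-midi !")
print(tokenizer.decode(token_ids))  # "bon après-midi !", mirroring the unit test below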
21 changes: 21 additions & 0 deletions nemo/collections/tts/g2p/models/i18n_ipa.py
@@ -405,6 +405,27 @@ def parse_one_word(self, word: str) -> Tuple[List[str], bool]:
else:
return self.phoneme_dict[word_found][0] + ["z"], True

+if self.locale == "fr-FR":
+# contracted prefix with apostrophe (e.g. "l'homme"): the contracted form is not in the phoneme dict
+contractions_g = ['l', 'c', 'd', 'j', 'm', 'n', 'qu', 's', 't', 'puisqu', 'lorsqu', 'jusqu']
+contractions_p = ['l', 's', 'd', 'ʒ', 'm', 'n', 'k', 's', 't', 'pyisk', 'loʁsk', 'ʒysk']
+
+for cont_g, cont_p in zip(contractions_g, contractions_p):
+starter = cont_g + "'"
+if len(word) > 2 and (word.startswith(starter) or word.startswith(starter.upper())):
+word_found = None
+if (word not in self.phoneme_dict) and (word.upper() not in self.phoneme_dict):
+start_index = len(starter)
+if word[start_index:] in self.phoneme_dict:
+word_found = word[start_index:]
+elif word[start_index:].upper() in self.phoneme_dict:
+word_found = word[start_index:].upper()
+
+if word_found is not None and (
+not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word_found)
+):
+return [c for c in cont_p] + self.phoneme_dict[word_found][0], True
# For the words that have a single pronunciation, directly look it up in the phoneme_dict; for the
# words that have multiple pronunciation variants, if we don't want to ignore them, then directly choose their
# first pronunciation variant as the target phonemes.
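
To make the control flow above easier to follow, here is a self-contained sketch of the same elision handling against a toy dictionary. It assumes fr-FR phoneme_dict keys are stored uppercase, as the .upper() lookups above imply; it is an illustration, not the module's API.

# Grapheme prefixes paired with their phonemes, mirroring contractions_g/_p above.
FR_CONTRACTIONS = {
    "l'": ['l'], "c'": ['s'], "d'": ['d'], "j'": ['ʒ'], "m'": ['m'], "n'": ['n'],
    "qu'": ['k'], "s'": ['s'], "t'": ['t'],
    "puisqu'": ['p', 'y', 'i', 's', 'k'],
    "lorsqu'": ['l', 'o', 'ʁ', 's', 'k'],
    "jusqu'": ['ʒ', 'y', 's', 'k'],
}

def split_contraction(word, phoneme_dict):
    """Return phonemes for e.g. "l'homme" when only "HOMME" is in the dict."""
    for prefix, prefix_phones in FR_CONTRACTIONS.items():
        if len(word) > 2 and word.lower().startswith(prefix):
            remainder = word[len(prefix):].upper()
            if remainder in phoneme_dict:
                # Prepend the prefix phonemes to the first pronunciation variant.
                return prefix_phones + phoneme_dict[remainder][0]
    return None

toy_dict = {"HOMME": [['ɔ', 'm']]}
print(split_contraction("l'homme", toy_dict))  # ['l', 'ɔ', 'm']

Note that the loop only returns when the remainder is actually in the dictionary, so a word like "jusqu'au" is not claimed by the shorter "j'" prefix unless "USQU'AU" happens to be an entry.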
tests for tokenizer_utils.py
@@ -16,6 +16,7 @@
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
any_locale_word_tokenize,
english_word_tokenize,
+french_text_preprocessing,
)


@@ -120,3 +121,57 @@ def test_any_locale_word_tokenize_with_numbers(self):

output = any_locale_word_tokenize(input_text)
assert output == expected_output

+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_any_locale_word_tokenize_fr(self):
+input_text = "pomme banane poire"
+expected_output = self._create_expected_output(["pomme", " ", "banane", " ", "poire"])
+
+output = any_locale_word_tokenize(input_text)
+assert output == expected_output
+
+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_any_locale_word_tokenize_with_accents_fr(self):
+input_text = "L’hétérogénéité entre les langues est étonnante."
+expected_output = self._create_expected_output(
+["L", "’", "hétérogénéité", " ", "entre", " ", "les", " ", "langues", " ", "est", " ", "étonnante", "."]
+)
+
+output = any_locale_word_tokenize(input_text)
+assert output == expected_output
+
+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_any_locale_word_tokenize_with_numbers_fr(self):
+input_text = r"Trois fois× quatorze^ et dix ÷ divisé par [films] sur \slash."
+expected_output = self._create_expected_output(
+[
+"Trois",
+" ",
+"fois",
+"× ",
+"quatorze",
+"^ ",
+"et",
+" ",
+"dix",
+" ÷ ",
+"divisé",
+" ",
+"par",
+" [",
+"films",
+"] ",
+"sur",
+" \\",
+"slash",
+".",
+]
+)
+
+output = any_locale_word_tokenize(input_text)
+assert output == expected_output
tests for tts_tokenizers.py
@@ -16,6 +16,7 @@

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
EnglishCharsTokenizer,
+FrenchCharsTokenizer,
GermanCharsTokenizer,
IPATokenizer,
ItalianCharsTokenizer,
@@ -38,6 +39,11 @@ class TestTTSTokenizers:
"CIAO": ["tʃˈao"],
"MONDO": ["mˈondo"],
}
+PHONEME_DICT_FR = {
+"BONJOUR": ["bɔ̃ʒˈuʁ"],
+"LE": ["lˈə-"],
+"MONDE": ["mˈɔ̃d"],
+}

@staticmethod
def _parse_text(tokenizer, text):
@@ -118,6 +124,18 @@ def test_spanish_chars_tokenizer(self):
assert chars == expected_output
assert len(tokens) == len(input_text)

+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_french_chars_tokenizer(self):
+input_text = "Bon après-midi !"
+expected_output = "bon après-midi !"
+
+tokenizer = FrenchCharsTokenizer()
+chars, tokens = self._parse_text(tokenizer, input_text)
+
+assert chars == expected_output
+assert len(tokens) == len(input_text)

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_ipa_tokenizer(self):
@@ -187,6 +205,18 @@ def test_ipa_tokenizer_es_es(self):

assert chars == expected_output

+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_ipa_tokenizer_fr_fr(self):
+input_text = "Bonjour le monde"
+expected_output = "bɔ̃ʒˈuʁ lˈə- mˈɔ̃d"
+
+g2p = IpaG2p(phoneme_dict=self.PHONEME_DICT_FR, locale="fr-FR")
+tokenizer = IPATokenizer(g2p=g2p, locale="fr-FR")
+chars, tokens = self._parse_text(tokenizer, input_text)
+
+assert chars == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_ipa_tokenizer_fixed_vocab(self):
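
Putting the pieces together, a hedged end-to-end sketch of the French pipeline added in this commit. The constructor keyword arguments mirror the tests above; the toy dictionary entries and the encode call are illustrative assumptions, not verified output.

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import IPATokenizer
from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p

phoneme_dict = {"BONJOUR": ["bɔ̃ʒˈuʁ"], "HOMME": ["ˈɔm"]}
g2p = IpaG2p(phoneme_dict=phoneme_dict, locale="fr-FR")
tokenizer = IPATokenizer(g2p=g2p, locale="fr-FR")

# "l'homme" is absent from the dict, so parse_one_word splits off "l'",
# looks up "HOMME", and prepends the prefix phoneme 'l'.
token_ids = tokenizer.encode("Bonjour l'homme")  # encode assumed from BaseTokenizer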
