French g2p with pronunciation dictionary (#7601)
* enable prondict g2p for fr
* add processing for contractions
* update ipa lexicon
* debug and add tests
* fix alphabet casing
* fix tokenizer utils tests
* add ipa tokenizer test for fr

Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>

---------

Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>
Signed-off-by: Mariana <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Xuesong Yang <[email protected]>
3 people authored Oct 20, 2023
1 parent d1b0162 commit 5895a57
Showing 6 changed files with 172 additions and 3 deletions.
32 changes: 30 additions & 2 deletions nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -15,7 +15,7 @@

# fmt: off

-SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT"]
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"]

DEFAULT_PUNCTUATION = (
',', '.', '!', '?', '-',
@@ -48,6 +48,13 @@
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ',
),
+"fr-FR": (
+'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
+'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
+'U', 'V', 'W', 'X', 'Y', 'Z', 'À', 'Â', 'Ä', 'Æ',
+'Ç', 'È', 'É', 'Ê', 'Ë', 'Í', 'Î', 'Ï', 'Ñ', 'Ô',
+'Ö', 'Ù', 'Û', 'Ü', 'Ō', 'Œ',
+),
"it-IT": (
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
@@ -77,6 +84,13 @@
'ɒ', 'ɔ', 'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ',
'ʊ', 'ʌ', 'ʒ', '̃', 'θ'
),
+"fr-FR": (
+'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
+'m', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
+'y', 'z', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɒ', 'ɔ',
+'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɲ', 'ɹ', 'ʁ', 'ʃ', 'ʊ',
+'ʌ', 'ʒ', 'θ', 'ː', '̃'
+),
"it-IT": (
'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
@@ -143,7 +157,7 @@ def get_ipa_punctuation_list(locale):
punct_set = set(DEFAULT_PUNCTUATION)
# TODO @xueyang: verify potential mismatches with locale-specific punctuation sets used
# in nemo_text_processing.text_normalization.en.taggers.punctuation.py
-if locale in ["de-DE", "es-ES", "it-IT"]:
+if locale in ["de-DE", "es-ES", "it-IT", "fr-FR"]:
# ref: https://en.wikipedia.org/wiki/Guillemet#Uses
punct_set.update(['«', '»', '‹', '›'])
if locale == "de-DE":
@@ -190,6 +204,20 @@ def get_ipa_punctuation_list(locale):
elif locale == "es-ES":
# ref: https://en.wikipedia.org/wiki/Spanish_orthography#Punctuation
punct_set.update(['¿', '¡'])
+elif locale == "fr-FR":
+punct_set.update(
+[
+'–', # en dash, U+2013, decimal 8211
+'“', # left double quotation mark, U+201C, decimal 8220
+'”', # right double quotation mark, U+201D, decimal 8221
+'…', # horizontal ellipsis, U+2026, decimal 8230
+'̀', # combining grave accent, U+0300, decimal 768
+'́', # combining acute accent, U+0301, decimal 769
+'̂', # combining circumflex accent, U+0302, decimal 770
+'̈', # combining diaeresis, U+0308, decimal 776
+'̧', # combining cedilla, U+0327, decimal 807
+]
+)

punct_list = sorted(list(punct_set))
return punct_list
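
For orientation, here is a minimal sketch of how the new "fr-FR" lexicon entries are consumed through this module's helpers. Both helper names appear in this commit; the membership checks in the asserts are assumptions about the returned collections, not verified output.

from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
    get_grapheme_character_set,
    get_ipa_punctuation_list,
)

# Grapheme set, lowercased the same way FrenchCharsTokenizer requests it below.
fr_graphemes = get_grapheme_character_set(locale="fr-FR", case="lower")
assert 'é' in fr_graphemes and 'œ' in fr_graphemes

# Punctuation: the defaults, plus guillemets, plus the French-specific marks
# added above (en dash, curly quotes, ellipsis, combining accents).
fr_punct = get_ipa_punctuation_list("fr-FR")
assert '«' in fr_punct and '…' in fr_punct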
nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -19,6 +19,7 @@
from typing import List, Tuple

__all__ = [
+"french_text_preprocessing",
"chinese_text_preprocessing",
"english_text_preprocessing",
"any_locale_text_preprocessing",
@@ -196,3 +197,7 @@ def italian_text_preprocessing(text: str) -> str:

def chinese_text_preprocessing(text: str) -> str:
return text
+
+
+def french_text_preprocessing(text: str) -> str:
+return text.lower()
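
The French hook only lowercases. This matters because english_text_preprocessing additionally folds accented characters toward ASCII, which would destroy French diacritics, while Python's str.lower() already maps accented capitals correctly:

assert french_text_preprocessing("Déjà Vu À Paris") == "déjà vu à paris"
assert french_text_preprocessing("ŒUF") == "œuf"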
nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -28,6 +28,7 @@
any_locale_text_preprocessing,
chinese_text_preprocessing,
english_text_preprocessing,
+french_text_preprocessing,
italian_text_preprocessing,
spanish_text_preprocessing,
)
@@ -268,6 +269,35 @@ def __init__(
)


+class FrenchCharsTokenizer(BaseCharsTokenizer):
+
+PUNCT_LIST = get_ipa_punctuation_list("fr-FR")
+
+def __init__(
+self, punct=True, apostrophe=True, add_blank_at=None, pad_with_space=False, non_default_punct_list=None,
+):
+"""French grapheme tokenizer.
+Args:
+punct: Whether to reserve grapheme for basic punctuation or not.
+apostrophe: Whether to use apostrophe or not.
+add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
+if None then no blank in labels.
+pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
+non_default_punct_list: List of punctuation marks which will be used instead of the default ones.
+"""
+
+fr_alphabet = get_grapheme_character_set(locale="fr-FR", case="lower")
+super().__init__(
+chars=fr_alphabet,
+punct=punct,
+apostrophe=apostrophe,
+add_blank_at=add_blank_at,
+pad_with_space=pad_with_space,
+non_default_punct_list=non_default_punct_list,
+text_preprocessing_func=french_text_preprocessing,
+)


class ItalianCharsTokenizer(BaseCharsTokenizer):
PUNCT_LIST = get_ipa_punctuation_list("it-IT")

@@ -619,7 +649,7 @@ def __init__(
Args:
g2p: Grapheme to phoneme module, should be IpaG2p or some subclass thereof.
locale: Locale used to determine default text processing logic and punctuation.
-Supports ["en-US", "de-DE", "es-ES"]. Defaults to "en-US".
+Supports ["en-US", "de-DE", "es-ES", "fr-FR"]. Defaults to "en-US".
Specify None if implementing custom logic for a new locale.
punct: Whether to reserve grapheme for basic punctuation or not.
non_default_punct_list: List of punctuation marks which will be used instead default, if any.
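
A brief usage sketch of the new grapheme tokenizer. The encode/decode method names are assumed from the BaseTokenizer interface exercised by the unit tests further down; the exact token ids depend on the vocabulary ordering.

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import FrenchCharsTokenizer

tokenizer = FrenchCharsTokenizer()
# french_text_preprocessing lowercases the input before tokenization.
token_ids = tokenizer.encode("Bon après-midi !")
print(tokenizer.decode(token_ids))  # "bon après-midi !", mirroring the unit test below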
21 changes: 21 additions & 0 deletions nemo/collections/tts/g2p/models/i18n_ipa.py
@@ -405,6 +405,27 @@ def parse_one_word(self, word: str) -> Tuple[List[str], bool]:
else:
return self.phoneme_dict[word_found][0] + ["z"], True

+if self.locale == "fr-FR":
+# contracted prefix with apostrophe (e.g. "l'homme"): the contracted form is not in the phoneme dict
+contractions_g = ['l', 'c', 'd', 'j', 'm', 'n', 'qu', 's', 't', 'puisqu', 'lorsqu', 'jusqu']
+contractions_p = ['l', 's', 'd', 'ʒ', 'm', 'n', 'k', 's', 't', 'pyisk', 'loʁsk', 'ʒysk']
+
+for cont_g, cont_p in zip(contractions_g, contractions_p):
+starter = cont_g + "'"
+if len(word) > 2 and (word.startswith(starter) or word.startswith(starter.upper())):
+word_found = None
+if (word not in self.phoneme_dict) and (word.upper() not in self.phoneme_dict):
+start_index = len(starter)
+if word[start_index:] in self.phoneme_dict:
+word_found = word[start_index:]
+elif word[start_index:].upper() in self.phoneme_dict:
+word_found = word[start_index:].upper()
+
+if word_found is not None and (
+not self.ignore_ambiguous_words or self.is_unique_in_phoneme_dict(word_found)
+):
+return [c for c in cont_p] + self.phoneme_dict[word_found][0], True
# For the words that have a single pronunciation, directly look it up in the phoneme_dict; for the
# words that have multiple pronunciation variants, if we don't want to ignore them, then directly choose their
# first pronunciation variant as the target phonemes.
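
To make the control flow above easier to follow, here is a self-contained sketch of the same elision handling against a toy dictionary. It assumes fr-FR phoneme_dict keys are stored uppercase, as the .upper() lookups above imply; it is an illustration, not the module's API.

# Grapheme prefixes paired with their phonemes, mirroring contractions_g/_p above.
FR_CONTRACTIONS = {
    "l'": ['l'], "c'": ['s'], "d'": ['d'], "j'": ['ʒ'], "m'": ['m'], "n'": ['n'],
    "qu'": ['k'], "s'": ['s'], "t'": ['t'],
    "puisqu'": ['p', 'y', 'i', 's', 'k'],
    "lorsqu'": ['l', 'o', 'ʁ', 's', 'k'],
    "jusqu'": ['ʒ', 'y', 's', 'k'],
}

def split_contraction(word, phoneme_dict):
    """Return phonemes for e.g. "l'homme" when only "HOMME" is in the dict."""
    for prefix, prefix_phones in FR_CONTRACTIONS.items():
        if len(word) > 2 and word.lower().startswith(prefix):
            remainder = word[len(prefix):].upper()
            if remainder in phoneme_dict:
                # Prepend the prefix phonemes to the first pronunciation variant.
                return prefix_phones + phoneme_dict[remainder][0]
    return None

toy_dict = {"HOMME": [['ɔ', 'm']]}
print(split_contraction("l'homme", toy_dict))  # ['l', 'ɔ', 'm']

Note that the loop only returns when the remainder is actually in the dictionary, so a word like "jusqu'au" is not claimed by the shorter "j'" prefix unless "USQU'AU" happens to be an entry.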
tests for tokenizer_utils.py
@@ -16,6 +16,7 @@
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import (
any_locale_word_tokenize,
english_word_tokenize,
+french_text_preprocessing,
)


@@ -120,3 +121,57 @@ def test_any_locale_word_tokenize_with_numbers(self):

output = any_locale_word_tokenize(input_text)
assert output == expected_output

+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_any_locale_word_tokenize_fr(self):
+input_text = "pomme banane poire"
+expected_output = self._create_expected_output(["pomme", " ", "banane", " ", "poire"])
+
+output = any_locale_word_tokenize(input_text)
+assert output == expected_output
+
+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_any_locale_word_tokenize_with_accents_fr(self):
+input_text = "L’hétérogénéité entre les langues est étonnante."
+expected_output = self._create_expected_output(
+["L", "’", "hétérogénéité", " ", "entre", " ", "les", " ", "langues", " ", "est", " ", "étonnante", "."]
+)
+
+output = any_locale_word_tokenize(input_text)
+assert output == expected_output
+
+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_any_locale_word_tokenize_with_numbers_fr(self):
+input_text = r"Trois fois× quatorze^ et dix ÷ divisé par [films] sur \slash."
+expected_output = self._create_expected_output(
+[
+"Trois",
+" ",
+"fois",
+"× ",
+"quatorze",
+"^ ",
+"et",
+" ",
+"dix",
+" ÷ ",
+"divisé",
+" ",
+"par",
+" [",
+"films",
+"] ",
+"sur",
+" \\",
+"slash",
+".",
+]
+)
+
+output = any_locale_word_tokenize(input_text)
+assert output == expected_output
tests for tts_tokenizers.py
@@ -16,6 +16,7 @@

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (
EnglishCharsTokenizer,
+FrenchCharsTokenizer,
GermanCharsTokenizer,
IPATokenizer,
ItalianCharsTokenizer,
@@ -38,6 +39,11 @@ class TestTTSTokenizers:
"CIAO": ["tʃˈao"],
"MONDO": ["mˈondo"],
}
+PHONEME_DICT_FR = {
+"BONJOUR": ["bɔ̃ʒˈuʁ"],
+"LE": ["lˈə-"],
+"MONDE": ["mˈɔ̃d"],
+}

@staticmethod
def _parse_text(tokenizer, text):
@@ -118,6 +124,18 @@ def test_spanish_chars_tokenizer(self):
assert chars == expected_output
assert len(tokens) == len(input_text)

+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_french_chars_tokenizer(self):
+input_text = "Bon après-midi !"
+expected_output = "bon après-midi !"
+
+tokenizer = FrenchCharsTokenizer()
+chars, tokens = self._parse_text(tokenizer, input_text)
+
+assert chars == expected_output
+assert len(tokens) == len(input_text)

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_ipa_tokenizer(self):
@@ -187,6 +205,18 @@ def test_ipa_tokenizer_es_es(self):

assert chars == expected_output

+@pytest.mark.run_only_on('CPU')
+@pytest.mark.unit
+def test_ipa_tokenizer_fr_fr(self):
+input_text = "Bonjour le monde"
+expected_output = "bɔ̃ʒˈuʁ lˈə- mˈɔ̃d"
+
+g2p = IpaG2p(phoneme_dict=self.PHONEME_DICT_FR, locale="fr-FR")
+tokenizer = IPATokenizer(g2p=g2p, locale="fr-FR")
+chars, tokens = self._parse_text(tokenizer, input_text)
+
+assert chars == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_ipa_tokenizer_fixed_vocab(self):
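
Putting the pieces together, a hedged end-to-end sketch of the French pipeline added in this commit. The constructor keyword arguments mirror the tests above; the toy dictionary entries and the encode call are illustrative assumptions, not verified output.

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import IPATokenizer
from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p

phoneme_dict = {"BONJOUR": ["bɔ̃ʒˈuʁ"], "HOMME": ["ˈɔm"]}
g2p = IpaG2p(phoneme_dict=phoneme_dict, locale="fr-FR")
tokenizer = IPATokenizer(g2p=g2p, locale="fr-FR")

# "l'homme" is absent from the dict, so parse_one_word splits off "l'",
# looks up "HOMME", and prepends the prefix phoneme 'l'.
token_ids = tokenizer.encode("Bonjour l'homme")  # encode assumed from BaseTokenizer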
