From 2f6fa29c0078000e7165301d6260fe448a03ca67 Mon Sep 17 00:00:00 2001 From: Giacomo Leone Maria Cavallini <72698188+GiacomoLeoneMaria@users.noreply.github.com> Date: Tue, 3 Oct 2023 05:40:12 +0200 Subject: [PATCH] add ItalianPhonemesTokenizer (#7587) * add ItalianPhonemesTokenizer Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Italian phonemes Signed-off-by: GiacomoLeoneMaria * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add test Signed-off-by: GiacomoLeoneMaria --------- Signed-off-by: GiacomoLeoneMaria Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Sasha Meister --- .../tokenizers/text_to_speech/ipa_lexicon.py | 7 +- .../text_to_speech/tts_tokenizers.py | 73 ++++++++++++++++++- .../text_to_speech/test_tts_tokenizers.py | 16 ++++ 3 files changed, 93 insertions(+), 3 deletions(-) diff --git a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py index 2e1bb359102b..338b3536519b 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py +++ b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py @@ -88,7 +88,7 @@ 'ɢ','ʛ','ɦ','ɧ','ħ','ɥ','ʜ','ɨ','ɬ','ɫ','ɮ','ʟ', 'ɱ','ɯ','ɰ','ɳ','ɵ','ɸ','œ','ɶ','ʘ','ɺ','ɻ','ʀ','ʁ', 'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ', - 'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ' + 'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː', ), } @@ -181,7 +181,10 @@ def get_ipa_punctuation_list(locale): '↑', '→', '↗', - '↘,', + '↘', + '”', + '’', + '-', ] ) elif locale == "es-ES": diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 32f725c9c73f..25b9d88a59dc 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -284,7 +284,7 @@ def __init__( non_default_punct_list: List of punctuation marks which will be used instead default. """ - it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòù" + it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòùó" super().__init__( chars=it_alphabet, punct=punct, @@ -367,6 +367,77 @@ def encode(self, text): return [self._token2id[p] for p in cs] +class ItalianPhonemesTokenizer(BaseCharsTokenizer): + # fmt: off + PUNCT_LIST = ( + ',', '.', '!', '?', '-', + ':', ';', '/', '"', '(', + ')', '[', ']', '{', '}', + '„', '“', '”', '‘', '’', '‒', '—', '«', '»', '‹', '›', '_', + ) + # fmt: on + + def __init__( + self, + punct=True, + apostrophe=True, + add_blank_at=None, + pad_with_space=False, + non_default_punct_list=None, + text_preprocessing_func=italian_text_preprocessing, + ): + """Italian phoneme-based tokenizer. + Args: + punct: Whether to reserve grapheme for basic punctuation or not. + apostrophe: Whether to use apostrophe or not. + add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), + if None then no blank in labels. + pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. + non_default_punct_list: List of punctuation marks which will be used instead default. + text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. + Currently, it only applies lower() function. + """ + + it_ipa = "abcdefghijklmnopqrstuvwxyzàèéìòùóæɐɑɔəɚɜɬɹʌʔᵻðŋɛɡɣɪɲɾʃʊʎʒʝβθd͡'t͡'øɒɕɓçɖɘɝɞɟʄɡɠɢʛɦɧħɥʜɨɬɫɮʟɱɯɰɳɵɸœɶʘɺɻʀʁɽʂʈʧʉʋⱱɤʍχʏʑʐʔʡʕʢǀǁǂᵻʃ'ː" + super().__init__( + chars=it_ipa, + punct=punct, + apostrophe=apostrophe, + add_blank_at=add_blank_at, + pad_with_space=pad_with_space, + non_default_punct_list=non_default_punct_list, + text_preprocessing_func=text_preprocessing_func, + ) + + def encode(self, text): + """See base class.""" + cs, space, tokens = [], self.tokens[self.space], set(self.tokens) + + text = self.text_preprocessing_func(text) + for c in text: + # Add space if last one isn't one + if c == space and len(cs) > 0 and cs[-1] != space: + cs.append(c) + # Add next char + elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens: + cs.append(c) + # Add punct + elif (c in self.PUNCT_LIST) and self.punct: + cs.append(c) + # Warn about unknown char + elif c != space: + logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.") + + # Remove trailing spaces + while cs[-1] == space: + cs.pop() + + if self.pad_with_space: + cs = [space] + cs + [space] + + return [self._token2id[p] for p in cs] + + class EnglishPhonemesTokenizer(BaseTokenizer): # fmt: off PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally diff --git a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py index 62c571bc16b7..bc065e75fa66 100644 --- a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py +++ b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py @@ -34,6 +34,10 @@ class TestTTSTokenizers: "BUENOS": ["bwˈenos"], "DÍAS": ["dˈias"], } + PHONEME_DICT_IT = { + "CIAO": ["tʃˈao"], + "MONDO": ["mˈondo"], + } @staticmethod def _parse_text(tokenizer, text): @@ -146,6 +150,18 @@ def test_ipa_tokenizer_de_de(self): assert chars == expected_output + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_ipa_tokenizer_it_it(self): + input_text = "Ciao mondo" + expected_output = "tʃˈao mˈondo" + + g2p = IpaG2p(phoneme_dict=self.PHONEME_DICT_IT, locale="it-IT") + tokenizer = IPATokenizer(g2p=g2p, locale="it-IT") + chars, tokens = self._parse_text(tokenizer, input_text) + + assert chars == expected_output + @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_ipa_tokenizer_en_us(self):