From 2f6fa29c0078000e7165301d6260fe448a03ca67 Mon Sep 17 00:00:00 2001
From: Giacomo Leone Maria Cavallini
 <72698188+GiacomoLeoneMaria@users.noreply.github.com>
Date: Tue, 3 Oct 2023 05:40:12 +0200
Subject: [PATCH] add ItalianPhonemesTokenizer (#7587)

* add ItalianPhonemesTokenizer

Signed-off-by: GiacomoLeoneMaria <giacomoleonemaria@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix Italian phonemes

Signed-off-by: GiacomoLeoneMaria <giacomoleonemaria@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add test

Signed-off-by: GiacomoLeoneMaria <giacomoleonemaria@gmail.com>

---------

Signed-off-by: GiacomoLeoneMaria <giacomoleonemaria@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Signed-off-by: Sasha Meister <sasha.meister.work@gmail.com>
---
 .../tokenizers/text_to_speech/ipa_lexicon.py  |  7 +-
 .../text_to_speech/tts_tokenizers.py          | 73 ++++++++++++++++++-
 .../text_to_speech/test_tts_tokenizers.py     | 16 ++++
 3 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
index 2e1bb359102b..338b3536519b 100644
--- a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
+++ b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -88,7 +88,7 @@
         'ɢ','ʛ','ɦ','ɧ','ħ','ɥ','ʜ','ɨ','ɬ','ɫ','ɮ','ʟ',
         'ɱ','ɯ','ɰ','ɳ','ɵ','ɸ','œ','ɶ','ʘ','ɺ','ɻ','ʀ','ʁ',
         'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ',
-        'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ'
+        'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː',
     ),
 }
 
@@ -181,7 +181,10 @@ def get_ipa_punctuation_list(locale):
                 '↑',
                 '→',
                 '↗',
-                '↘,',
+                '↘',
+                '”',
+                '’',
+                '-',
             ]
         )
     elif locale == "es-ES":
diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
index 32f725c9c73f..25b9d88a59dc 100644
--- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
+++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -284,7 +284,7 @@ def __init__(
             non_default_punct_list: List of punctuation marks which will be used instead default.
         """
 
-        it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòù"
+        it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòùó"
         super().__init__(
             chars=it_alphabet,
             punct=punct,
@@ -367,6 +367,77 @@ def encode(self, text):
         return [self._token2id[p] for p in cs]
 
 
+class ItalianPhonemesTokenizer(BaseCharsTokenizer):
+    # fmt: off
+    PUNCT_LIST = (
+        ',', '.', '!', '?', '-',
+        ':', ';', '/', '"', '(',
+        ')', '[', ']', '{', '}',
+        '„', '“', '”', '‘', '’', '‒', '—', '«', '»', '‹', '›', '_',
+    )
+    # fmt: on
+
+    def __init__(
+        self,
+        punct=True,
+        apostrophe=True,
+        add_blank_at=None,
+        pad_with_space=False,
+        non_default_punct_list=None,
+        text_preprocessing_func=italian_text_preprocessing,
+    ):
+        """Italian phoneme-based tokenizer.
+        Args:
+            punct: Whether to reserve grapheme for basic punctuation or not.
+            apostrophe: Whether to use apostrophe or not.
+            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
+             if None then no blank in labels.
+            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
+            non_default_punct_list: List of punctuation marks which will be used instead default.
+            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
+             Currently, it only applies lower() function.
+        """
+        # Symbol inventory passed to the base class as `chars`: Italian letters followed by IPA symbols.
+        it_ipa = "abcdefghijklmnopqrstuvwxyzàèéìòùóæɐɑɔəɚɜɬɹʌʔᵻðŋɛɡɣɪɲɾʃʊʎʒʝβθd͡'t͡'øɒɕɓçɖɘɝɞɟʄɡɠɢʛɦɧħɥʜɨɬɫɮʟɱɯɰɳɵɸœɶʘɺɻʀʁɽʂʈʧʉʋⱱɤʍχʏʑʐʔʡʕʢǀǁǂᵻʃ'ː"
+        super().__init__(
+            chars=it_ipa,
+            punct=punct,
+            apostrophe=apostrophe,
+            add_blank_at=add_blank_at,
+            pad_with_space=pad_with_space,
+            non_default_punct_list=non_default_punct_list,
+            text_preprocessing_func=text_preprocessing_func,
+        )
+
+    def encode(self, text):
+        """Preprocess `text` and map its characters to token ids, dropping unsupported characters with a warning."""
+        cs, space, tokens = [], self.tokens[self.space], set(self.tokens)
+
+        text = self.text_preprocessing_func(text)
+        for c in text:
+            # Add space if last one isn't one
+            if c == space and len(cs) > 0 and cs[-1] != space:
+                cs.append(c)
+            # Add next char. NOTE(review): the combining tie bar in it_ipa fails isalnum() and is skipped - confirm.
+            elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens:
+                cs.append(c)
+            # Add punct
+            elif (c in self.PUNCT_LIST) and self.punct:
+                cs.append(c)
+            # Warn about unknown char
+            elif c != space:
+                logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")
+
+        # Remove trailing spaces; cs is empty when no character was kept, so check before indexing
+        while cs and cs[-1] == space:
+            cs.pop()
+
+        if self.pad_with_space:
+            cs = [space] + cs + [space]
+
+        return [self._token2id[p] for p in cs]
+
+
 class EnglishPhonemesTokenizer(BaseTokenizer):
     # fmt: off
     PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
diff --git a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py
index 62c571bc16b7..bc065e75fa66 100644
--- a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py
+++ b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py
@@ -34,6 +34,10 @@ class TestTTSTokenizers:
         "BUENOS": ["bwˈenos"],
         "DÍAS": ["dˈias"],
     }
+    PHONEME_DICT_IT = {
+        "CIAO": ["tʃˈao"],
+        "MONDO": ["mˈondo"],
+    }
 
     @staticmethod
     def _parse_text(tokenizer, text):
@@ -146,6 +150,18 @@ def test_ipa_tokenizer_de_de(self):
 
         assert chars == expected_output
 
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_tokenizer_it_it(self):
+        """Words listed in PHONEME_DICT_IT are rendered as their dictionary pronunciations."""
+        italian_g2p = IpaG2p(phoneme_dict=self.PHONEME_DICT_IT, locale="it-IT")
+        italian_tokenizer = IPATokenizer(g2p=italian_g2p, locale="it-IT")
+
+        sentence, phonemes = "Ciao mondo", "tʃˈao mˈondo"
+        chars, _ = self._parse_text(italian_tokenizer, sentence)
+
+        assert chars == phonemes
+
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_ipa_tokenizer_en_us(self):