From 251d3343bc188e5525af5659afff0eb1588942af Mon Sep 17 00:00:00 2001
From: Huu Tuong Tu <83907151+huutuongtu@users.noreply.github.com>
Date: Fri, 26 Jul 2024 15:24:48 +0700
Subject: [PATCH] [TTS][Vietnamese] Add VietnameseCharsTokenizer (#9665)

* Update tts_tokenizers.py

* Update tokenizer_utils.py

* Update test_tts_tokenizers.py

* Apply isort and black reformatting

Signed-off-by: huutuongtu

* Signed-off-by: Tu <huutu12312vn@gmail.com>

* Update ipa_lexicon.py

Signed-off-by: Tu <huutu12312vn@gmail.com>
Signed-off-by: XuesongYang

---------

Signed-off-by: huutuongtu
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Signed-off-by: XuesongYang
Co-authored-by: huutuongtu
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: XuesongYang
Signed-off-by: Hainan Xu
---
 .../tokenizers/text_to_speech/ipa_lexicon.py  | 29 +++++++++++++-
 .../text_to_speech/tokenizer_utils.py         |  5 +++
 .../text_to_speech/tts_tokenizers.py          | 38 +++++++++++++++++++
 .../text_to_speech/test_tts_tokenizers.py     | 13 +++++++
 4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
index f4081735eb71..6b7677431fc7 100644
--- a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
+++ b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
@@ -15,7 +15,9 @@
 
 # fmt: off
 
-SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"]
+
+SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN"]
+
 
 DEFAULT_PUNCTUATION = (
     ',', '.', '!', '?', '-',
@@ -48,6 +50,19 @@
         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
         'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ',
     ),
+    # ref: https://en.wikipedia.org/wiki/Vietnamese_alphabet
+    "vi-VN": (
+        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
+        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
+        'U', 'V', 'W', 'X', 'Y', 'Z', 'Đ', 'Á', 'À', 'Ã',
+        'Ả', 'Ạ', 'Ă', 'Ắ', 'Ằ', 'Ẵ', 'Ẳ', 'Ặ', 'Â', 'Ấ',
+        'Ầ', 'Ẫ', 'Ẩ', 'Ậ', 'Ó', 'Ò', 'Õ', 'Ỏ', 'Ọ', 'Ô',
+        'Ố', 'Ồ', 'Ỗ', 'Ổ', 'Ộ', 'Ơ', 'Ớ', 'Ờ', 'Ỡ', 'Ở',
+        'Ợ', 'É', 'È', 'Ẽ', 'Ẻ', 'Ẹ', 'Ê', 'Ế', 'Ề', 'Ễ',
+        'Ể', 'Ệ', 'Ú', 'Ù', 'Ũ', 'Ủ', 'Ụ', 'Ư', 'Ứ', 'Ừ',
+        'Ữ', 'Ử', 'Ự', 'Í', 'Ì', 'Ĩ', 'Ỉ', 'Ị', 'Ý', 'Ỳ',
+        'Ỹ', 'Ỷ', 'Ỵ',
+    ),
     "fr-FR": (
         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
@@ -104,6 +119,18 @@
         'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ',
         'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː',
     ),
+    "vi-VN": (
+        'a', 'ə', 'ɛ', 'e', 'i', 'o', 'ɔ', 'u', 'ɨ',
+        'b', 'c', 'z', 'j', 'd', 'g', 'h', 'x', 'l',
+        'm', 'n', 'ŋ', 'ɲ', 'p', 'f', 'w', 'r', 's',
+        'ʃ', 't', 'ʈ', 'ʂ', 'v', 'ʔ', 'ɓ', 'ɗ', 'ɣ',
+        'k', 'ʰ', 'ʷ', 'ɕ', 'ʑ', 'ʝ', '̚', '̟', 't͡',
+        '˧', 'ː', 'ɯ', '̀', '̄', '̌', '̂', 'ˀ', '͡', '˥',
+        '˩', '̤', '˨', 'ɹ', 'ʲ', '̯', 'ă', 'ə̆', 'ǐ',
+        '˦', 'æ', 'ɐ',
+        'ɜ', 'ɡ', 'ɪ', 'ɬ', 'ɾ', 'ʊ', 'ʌ', 'ʒ', '̃',
+        '̩', 'θ', 'ᵻ',
+    ),
 }
 
 GRAPHEME_CHARACTER_CASES = ["upper", "lower", "mixed"]
diff --git a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
index 542b18186846..c82d4f3cce19 100644
--- a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
+++ b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -24,6 +24,7 @@
     "english_text_preprocessing",
     "any_locale_text_preprocessing",
     "spanish_text_preprocessing",
+    "vietnamese_text_preprocessing",
     "italian_text_preprocessing",
     "any_locale_word_tokenize",
     "english_word_tokenize",
@@ -201,3 +202,7 @@ def chinese_text_preprocessing(text: str) -> str:
 
 def french_text_preprocessing(text: str) -> str:
     return text.lower()
+
+
+def vietnamese_text_preprocessing(text: str) -> str:
+    return text.lower()
diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
index 4998fbba1ac9..6332c91cad46 100644
--- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
+++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -31,6 +31,7 @@
     french_text_preprocessing,
     italian_text_preprocessing,
     spanish_text_preprocessing,
+    vietnamese_text_preprocessing,
 )
 from nemo.utils import logging
 from nemo.utils.decorators import experimental
@@ -202,6 +203,43 @@ def __init__(
         )
 
 
+class VietnameseCharsTokenizer(BaseCharsTokenizer):
+
+    _LOCALE = "vi-VN"
+    _CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
+
+    def __init__(
+        self,
+        chars=_CHARSET_STR,
+        punct=True,
+        apostrophe=True,
+        add_blank_at=None,
+        pad_with_space=False,
+        non_default_punct_list=None,
+        text_preprocessing_func=vietnamese_text_preprocessing,
+    ):
+        """Vietnamese grapheme tokenizer.
+        Args:
+            punct: Whether to reserve grapheme for basic punctuation or not.
+            apostrophe: Whether to use apostrophe or not.
+            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
+                if None then no blank in labels.
+            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
+            non_default_punct_list: List of punctuation marks which will be used instead of the default ones.
+            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. By
+                default, it lowercases the input text.
+        """
+        super().__init__(
+            chars=chars,
+            punct=punct,
+            apostrophe=apostrophe,
+            add_blank_at=add_blank_at,
+            pad_with_space=pad_with_space,
+            non_default_punct_list=non_default_punct_list,
+            text_preprocessing_func=text_preprocessing_func,
+        )
+
+
 class GermanCharsTokenizer(BaseCharsTokenizer):
 
     _LOCALE = "de-DE"
diff --git a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py
index 2e2f9bdaaf36..2023d31696b1 100644
--- a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py
+++ b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py
@@ -21,6 +21,7 @@
     IPATokenizer,
     ItalianCharsTokenizer,
     SpanishCharsTokenizer,
+    VietnameseCharsTokenizer,
 )
 from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p
 
@@ -124,6 +125,18 @@ def test_spanish_chars_tokenizer(self):
         assert chars == expected_output
         assert len(tokens) == len(input_text)
 
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_vietnamese_chars_tokenizer(self):
+        input_text = "Xin chào các bạn."
+        expected_output = "xin chào các bạn."
+
+        tokenizer = VietnameseCharsTokenizer()
+        chars, tokens = self._parse_text(tokenizer, input_text)
+
+        assert chars == expected_output
+        assert len(tokens) == len(input_text)
+
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_french_chars_tokenizer(self):