Skip to content

Commit

Permalink
[TTS][Vietnamese] Add VietnameseCharsTokenizer (NVIDIA#9665)
Browse files Browse the repository at this point in the history
* Update tts_tokenizers.py
* Update tokenizer_utils.py
* Update test_tts_tokenizers.py
* Apply isort and black reformatting

Signed-off-by: huutuongtu <[email protected]>

* Signed-off-by: Tu [[email protected]](mailto:[email protected])

* Update ipa_lexicon.py - Signed-off-by: Tu [[email protected]](mailto:[email protected])

Signed-off-by: XuesongYang <[email protected]>

---------

Signed-off-by: huutuongtu <[email protected]>
Signed-off-by: Xuesong Yang <[email protected]>
Signed-off-by: XuesongYang <[email protected]>
Co-authored-by: huutuongtu <[email protected]>
Co-authored-by: Xuesong Yang <[email protected]>
Co-authored-by: XuesongYang <[email protected]>
Signed-off-by: Boxiang Wang <[email protected]>
  • Loading branch information
4 people authored and BoxiangW committed Jul 30, 2024
1 parent 85aaf40 commit 9097031
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 1 deletion.
29 changes: 28 additions & 1 deletion nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@

# fmt: off

SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"]

SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN"]


DEFAULT_PUNCTUATION = (
',', '.', '!', '?', '-',
Expand Down Expand Up @@ -48,6 +50,19 @@
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ',
),
# ref: https://en.wikipedia.org/wiki/Vietnamese_alphabet
"vi-VN": (
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z', 'Đ', 'Á', 'À', 'Ã',
'Ả', 'Ạ', 'Ă', 'Ắ', 'Ằ', 'Ẵ', 'Ẳ', 'Ặ', 'Â', 'Ấ',
'Ầ', 'Ẫ', 'Ẩ', 'Ậ', 'Ó', 'Ò', 'Õ', 'Ỏ', 'Ọ', 'Ô',
'Ố', 'Ồ', 'Ỗ', 'Ổ', 'Ộ', 'Ơ', 'Ớ', 'Ờ', 'Ỡ', 'Ở',
'Ợ', 'É', 'È', 'Ẽ', 'Ẻ', 'Ẹ', 'Ê', 'Ế', 'Ề', 'Ễ',
'Ể', 'Ệ', 'Ú', 'Ù', 'Ũ', 'Ủ', 'Ụ', 'Ư', 'Ứ', 'Ừ',
'Ữ', 'Ử', 'Ự', 'Í', 'Ì', 'Ĩ', 'Ỉ', 'Ị', 'Ý', 'Ỳ',
'Ỹ', 'Ỷ', 'Ỵ',
),
"fr-FR": (
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
Expand Down Expand Up @@ -104,6 +119,18 @@
'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ',
'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː',
),
"vi-VN": (
# Vietnamese IPA symbols. Every symbol must be its own tuple element:
# adjacent string literals with no comma between them are silently
# concatenated by Python into a single multi-character entry.
'a', 'ə', 'ɛ', 'e', 'i', 'o', 'ɔ', 'u', 'ɨ',
'b', 'c', 'z', 'j', 'd', 'g', 'h', 'x', 'l',
'm', 'n', 'ŋ', 'ɲ', 'p', 'f', 'w', 'r', 's',
'ʃ', 't', 'ʈ', 'ʂ', 'v', 'ʔ', 'ɓ', 'ɗ', 'ɣ',
'k', 'ʰ', 'ʷ', 'ɕ', 'ʑ', 'ʝ', '̚', '̟', 't͡',
'˧', 'ː', 'ɯ', '̀', '̄', '̌', '̂', 'ˀ', '͡', '˥',
'˩', '̤', '˨', 'ɹ', 'ʲ', '̯', 'ă', 'ə̆', 'ǐ',
'˦', 'æ', 'ɐ',
'ɜ', 'ɡ', 'ɪ', 'ɬ', 'ɾ', 'ʊ', 'ʌ', 'ʒ', '̃',
'̩', 'θ', 'ᵻ',
),
}

GRAPHEME_CHARACTER_CASES = ["upper", "lower", "mixed"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"english_text_preprocessing",
"any_locale_text_preprocessing",
"spanish_text_preprocessing",
"vietnamese_text_preprocessing",
"italian_text_preprocessing",
"any_locale_word_tokenize",
"english_word_tokenize",
Expand Down Expand Up @@ -201,3 +202,7 @@ def chinese_text_preprocessing(text: str) -> str:

def french_text_preprocessing(text: str) -> str:
    """Normalize French input text by converting it to lowercase.

    Args:
        text: Raw input string.

    Returns:
        The lowercased copy of ``text``.
    """
    lowered = text.lower()
    return lowered


def vietnamese_text_preprocessing(text: str) -> str:
    """Normalize Vietnamese input text by converting it to lowercase.

    Args:
        text: Raw input string.

    Returns:
        The lowercased copy of ``text``.
    """
    lowered = text.lower()
    return lowered
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
french_text_preprocessing,
italian_text_preprocessing,
spanish_text_preprocessing,
vietnamese_text_preprocessing,
)
from nemo.utils import logging
from nemo.utils.decorators import experimental
Expand Down Expand Up @@ -202,6 +203,43 @@ def __init__(
)


class VietnameseCharsTokenizer(BaseCharsTokenizer):
    """Grapheme (character-level) tokenizer for the ``vi-VN`` locale."""

    _LOCALE = "vi-VN"
    # Default character inventory: the mixed-case grapheme set of the locale.
    _CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")

    def __init__(
        self,
        chars=_CHARSET_STR,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=None,
        text_preprocessing_func=vietnamese_text_preprocessing,
    ):
        """Vietnamese grapheme tokenizer.
        Args:
            chars: Characters used as tokens. Defaults to the mixed-case grapheme set of the "vi-VN" locale.
            punct: Whether to reserve grapheme for basic punctuation or not.
            apostrophe: Whether to use apostrophe or not.
            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
                if None then no blank in labels.
            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
            non_default_punct_list: List of punctuation marks which will be used instead default.
            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. By default, it
                would keep any word lowercase.
        """
        super().__init__(
            chars=chars,
            punct=punct,
            apostrophe=apostrophe,
            add_blank_at=add_blank_at,
            pad_with_space=pad_with_space,
            non_default_punct_list=non_default_punct_list,
            # Forward the caller-supplied function; hard-coding the default
            # here would silently discard any custom preprocessing.
            text_preprocessing_func=text_preprocessing_func,
        )


class GermanCharsTokenizer(BaseCharsTokenizer):

_LOCALE = "de-DE"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
IPATokenizer,
ItalianCharsTokenizer,
SpanishCharsTokenizer,
VietnameseCharsTokenizer,
)
from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p

Expand Down Expand Up @@ -124,6 +125,18 @@ def test_spanish_chars_tokenizer(self):
assert chars == expected_output
assert len(tokens) == len(input_text)

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_vietnamese_chars_tokenizer(self):
    """Check that a Vietnamese sentence is lowercased and yields one token per input character."""
    raw_sentence = "Xin chào các bạn."
    lowercased_sentence = "xin chào các bạn."

    decoded_chars, token_ids = self._parse_text(VietnameseCharsTokenizer(), raw_sentence)

    assert decoded_chars == lowercased_sentence
    assert len(token_ids) == len(raw_sentence)

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_french_chars_tokenizer(self):
Expand Down

0 comments on commit 9097031

Please sign in to comment.