Skip to content

Commit

Permalink
add ItalianPhonemesTokenizer (NVIDIA#7587)
Browse files Browse the repository at this point in the history
* add ItalianPhonemesTokenizer

Signed-off-by: GiacomoLeoneMaria <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix Italian phonemes

Signed-off-by: GiacomoLeoneMaria <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add test

Signed-off-by: GiacomoLeoneMaria <[email protected]>

---------

Signed-off-by: GiacomoLeoneMaria <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Xuesong Yang <[email protected]>
Signed-off-by: Sasha Meister <[email protected]>
  • Loading branch information
3 people authored and ssh-meister committed Oct 10, 2023
1 parent 8f306a2 commit 2f6fa29
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
'ɢ','ʛ','ɦ','ɧ','ħ','ɥ','ʜ','ɨ','ɬ','ɫ','ɮ','ʟ',
'ɱ','ɯ','ɰ','ɳ','ɵ','ɸ','œ','ɶ','ʘ','ɺ','ɻ','ʀ','ʁ',
'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ',
'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ'
'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː',
),
}

Expand Down Expand Up @@ -181,7 +181,10 @@ def get_ipa_punctuation_list(locale):
'↑',
'→',
'↗',
'↘,',
'↘',
'”',
'’',
'-',
]
)
elif locale == "es-ES":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def __init__(
non_default_punct_list: List of punctuation marks which will be used instead of the default.
"""

it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòù"
it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòùó"
super().__init__(
chars=it_alphabet,
punct=punct,
Expand Down Expand Up @@ -367,6 +367,77 @@ def encode(self, text):
return [self._token2id[p] for p in cs]


class ItalianPhonemesTokenizer(BaseCharsTokenizer):
    """Character-level tokenizer over Italian letters and IPA phoneme symbols.

    The vocabulary passed to the base class is the Italian alphabet (including
    accented vowels) followed by a set of IPA symbols. ``encode`` walks the
    pre-processed text character by character and keeps spaces, known symbols
    and (optionally) punctuation, skipping everything else with a warning.
    """

    # fmt: off
    PUNCT_LIST = (
        ',', '.', '!', '?', '-',
        ':', ';', '/', '"', '(',
        ')', '[', ']', '{', '}',
        '„', '“', '”', '‘', '’', '‒', '—', '«', '»', '‹', '›', '_',
    )
    # fmt: on

    def __init__(
        self,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=None,
        text_preprocessing_func=italian_text_preprocessing,
    ):
        """Italian phoneme-based tokenizer.
        Args:
            punct: Whether to reserve grapheme for basic punctuation or not.
            apostrophe: Whether to use apostrophe or not.
            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
                if None then no blank in labels.
            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
            non_default_punct_list: List of punctuation marks which will be used instead of the default.
            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
                Currently, it only applies lower() function.
        """

        # Italian alphabet followed by IPA symbols; every character of this string is handed to the
        # base class as the `chars` vocabulary. Some symbols appear more than once in the literal.
        it_ipa = "abcdefghijklmnopqrstuvwxyzàèéìòùóæɐɑɔəɚɜɬɹʌʔᵻðŋɛɡɣɪɲɾʃʊʎʒʝβθd͡'t͡'øɒɕɓçɖɘɝɞɟʄɡɠɢʛɦɧħɥʜɨɬɫɮʟɱɯɰɳɵɸœɶʘɺɻʀʁɽʂʈʧʉʋⱱɤʍχʏʑʐʔʡʕʢǀǁǂᵻʃ'ː"
        super().__init__(
            chars=it_ipa,
            punct=punct,
            apostrophe=apostrophe,
            add_blank_at=add_blank_at,
            pad_with_space=pad_with_space,
            non_default_punct_list=non_default_punct_list,
            text_preprocessing_func=text_preprocessing_func,
        )

    def encode(self, text):
        """See base class.

        Args:
            text: Raw input string; it is passed through ``self.text_preprocessing_func`` first.

        Returns:
            List of token ids (looked up in ``self._token2id``) for the characters that were kept.
            The list is empty when nothing was kept and ``pad_with_space`` is false.
        """
        cs, space, tokens = [], self.tokens[self.space], set(self.tokens)

        text = self.text_preprocessing_func(text)
        for c in text:
            # Add space if last one isn't one (leading and repeated spaces are dropped)
            if c == space and len(cs) > 0 and cs[-1] != space:
                cs.append(c)
            # Add next char
            # NOTE(review): "\u0303" (combining tilde) does not appear in `it_ipa`, while the combining
            # mark used inside "d͡" / "t͡" is not alphanumeric and therefore falls through to the warning
            # branch below — confirm whether that mark should be accepted here instead.
            elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens:
                cs.append(c)
            # Add punct
            elif (c in self.PUNCT_LIST) and self.punct:
                cs.append(c)
            # Warn about unknown char
            elif c != space:
                logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

        # Remove trailing spaces. The emptiness guard avoids an IndexError when the text is empty
        # or every character was skipped.
        while cs and cs[-1] == space:
            cs.pop()

        if self.pad_with_space:
            cs = [space] + cs + [space]

        return [self._token2id[p] for p in cs]


class EnglishPhonemesTokenizer(BaseTokenizer):
# fmt: off
PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ class TestTTSTokenizers:
"BUENOS": ["bwˈenos"],
"DÍAS": ["dˈias"],
}
# Upper-case Italian words mapped to lists of IPA pronunciations; passed as `phoneme_dict` to IpaG2p
# (locale "it-IT") in the Italian IPA tokenizer test.
PHONEME_DICT_IT = {
"CIAO": ["tʃˈao"],
"MONDO": ["mˈondo"],
}

@staticmethod
def _parse_text(tokenizer, text):
Expand Down Expand Up @@ -146,6 +150,18 @@ def test_ipa_tokenizer_de_de(self):

assert chars == expected_output

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_ipa_tokenizer_it_it(self):
    """Check that parsing "Ciao mondo" with the it-IT IPATokenizer yields the expected phoneme string."""
    italian_g2p = IpaG2p(phoneme_dict=self.PHONEME_DICT_IT, locale="it-IT")
    italian_tokenizer = IPATokenizer(g2p=italian_g2p, locale="it-IT")

    # Only the decoded character string is checked; the token ids are not inspected.
    chars, _ = self._parse_text(italian_tokenizer, "Ciao mondo")

    assert chars == "tʃˈao mˈondo"

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_ipa_tokenizer_en_us(self):
Expand Down

0 comments on commit 2f6fa29

Please sign in to comment.