diff --git a/.github/workflows/build-python.yml b/.github/workflows/build-python.yml new file mode 100644 index 0000000..0caf69b --- /dev/null +++ b/.github/workflows/build-python.yml @@ -0,0 +1,192 @@ +name: Build Python + +on: + push: + branches: + - main + paths: + - '.github/workflows/build-python.yml' + - 'wrappers/**' + - 'src/**' + - 'include/**' + - 'example/**' + - 'CMakeLists.txt' + +jobs: + build-linux: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install CMake + run: sudo apt-get update && sudo apt-get install -y cmake + + - name: Determine CPU Cores + id: cpu-info + run: echo "CPU_CORES=$(nproc)" >> $GITHUB_ENV + + - name: Configure CMake + run: cmake -B build -DCMAKE_BUILD_TYPE=Release + + - name: Build + run: cmake --build build --config Release -- -j${{ env.CPU_CORES }} + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v3 + with: + name: linux-x86_64 + path: lib/ + + build-macos: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-13, macos-14] + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install Homebrew + run: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" + + - name: Update CMake + run: brew install cmake + + - name: Determine CPU Cores + id: cpu-info + run: echo "CPU_CORES=$(sysctl -n hw.ncpu)" >> $GITHUB_ENV + + - name: Configure CMake + run: cmake -B build -DCMAKE_BUILD_TYPE=Release + + - name: Build + run: cmake --build build --config Release -- -j${{ env.CPU_CORES }} + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.os == 'macos-13' && 'macos-x86_64' || 'macos-arm64' }} + path: lib/ + + create-universal-dylibs: + needs: build-macos + runs-on: macos-latest + steps: + - name: Download x86_64 Build Artifacts + uses: actions/download-artifact@v3 + with: + name: macos-x86_64 + path: macos-x86_64 + + - name: Download arm64 Build Artifacts + uses: actions/download-artifact@v3 + with: + name: macos-arm64 + path: macos-arm64 + + - name: Create Universal dylibs + run: | + mkdir -p universal/lib + for dylib in macos-x86_64/*.dylib; do + dylib_name=$(basename $dylib) + lipo -create macos-x86_64/$dylib_name macos-arm64/$dylib_name -output universal/$dylib_name + done + + - name: Upload Universal dylibs + uses: actions/upload-artifact@v3 + with: + name: macos-universal + path: universal/ + + build-windows: + runs-on: windows-latest + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install CMake + run: choco install -y cmake + + - name: Configure CMake + run: cmake -B build -DCMAKE_BUILD_TYPE=Release + + - name: Build + run: cmake --build build --config Release -- /m:4 + + - name: Copy Everything from \build\Release\ to \lib + run: xcopy /E /Y build\Release\ lib\ + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v3 + with: + name: windows-x86_64 + path: lib/ + + build-python: + needs: + - build-linux + - build-windows + - create-universal-dylibs + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Download Linux Build Artifacts + uses: actions/download-artifact@v3 + with: + name: linux-x86_64 + path: linux + + - name: Download Windows Build Artifacts + uses: actions/download-artifact@v3 + with: + name: windows-x86_64 + path: windows + + - name: Download macOS Universal Build Artifacts + uses: actions/download-artifact@v3 + with: + name: macos-universal + path: macos + + - name: Create Build Directories + run: | + mkdir -p build/linux + mkdir -p build/windows + mkdir -p build/macos + mkdir -p build/models + + - name: Copy Linux Build Artifacts + run: cp -r linux/* build/linux + + - name: Copy Windows Build Artifacts + run: cp -r windows/* build/windows + + - name: Copy macOS Universal Build Artifacts + run: cp -r macos/* build/macos + + - name: Copy Models + run: cp -r models/* build/models + + - name: Copy Python Wrapper + run: cp wrappers/babylon.py build/__init__.py + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v3 + with: + name: python + path: build/ diff --git a/example/main.cpp b/example/main.cpp index 6d2ef33..ff137d0 100644 --- a/example/main.cpp +++ b/example/main.cpp @@ -13,13 +13,25 @@ int main(int argc, char** argv) { text = argv[1]; - DeepPhonemizer::Session dp(dp_model_path); + DeepPhonemizer::Session dp(dp_model_path, "en_us", true); Vits::Session vits(vits_model_path); std::vector phonemes = dp.g2p(text); + for (const auto& phoneme : phonemes) { + std::cout << phoneme << " "; + } + std::cout << std::endl; + vits.tts(phonemes, "./babylon_output.wav"); + std::vector phoneme_ids = dp.g2p_tokens(text); + + for (const auto& id : phoneme_ids) { + std::cout << id << " "; + } + std::cout << std::endl; + return 0; } \ No newline at end of file diff --git a/include/babylon.h b/include/babylon.h index 02f2169..0114de0 100644 --- a/include/babylon.h +++ b/include/babylon.h @@ -15,6 +15,8 @@ BABYLON_EXPORT int babylon_g2p_init(const char* model_path, const char* language BABYLON_EXPORT char* babylon_g2p(const char* text); +BABYLON_EXPORT int* babylon_g2p_tokens(const char* text); + BABYLON_EXPORT void babylon_g2p_free(void); BABYLON_EXPORT int babylon_tts_init(const char* model_path); diff --git a/include/babylon.hpp b/include/babylon.hpp index a0e384d..1e6298d 100644 --- a/include/babylon.hpp +++ b/include/babylon.hpp @@ -12,11 +12,12 @@ namespace DeepPhonemizer { public: SequenceTokenizer(const std::vector& symbols, const std::vector& languages, int char_repeats, bool lowercase = true, bool append_start_end = true); std::vector operator()(const std::string& sentence, const std::string& language) const; - std::vector decode(const std::vector& sequence, bool remove_special_tokens = false) const; + std::vector decode(const std::vector& sequence) const; + std::vector clean(const std::vector& sequence) const; + int64_t get_token(const std::string& token) const; private: - std::unordered_map token_to_idx; - std::unordered_map idx_to_token; + std::vector tokens; int char_repeats; bool lowercase; bool append_start_end; @@ -25,9 +26,6 @@ namespace DeepPhonemizer { std::string pad_token; std::string end_token; std::unordered_set special_tokens; - - int get_start_index(const std::string& language) const; - std::string make_start_token(const std::string& language) const; }; class Session { @@ -36,19 +34,16 @@ namespace DeepPhonemizer { ~Session(); std::vector g2p(const std::string& text); + std::vector g2p_tokens(const std::string& text); private: - const std::array input_names = {"text"}; - const std::array output_names = {"output"}; - std::string lang; bool punctuation; Ort::Session* session; SequenceTokenizer* text_tokenizer; SequenceTokenizer* phoneme_tokenizer; - std::unordered_map> dictionary; - std::vector g2p_internal(const std::string& text); + std::vector g2p_tokens_internal(const std::string& text); }; std::vector clean_text(const std::string& text); @@ -72,9 +67,6 @@ namespace Vits { void tts(const std::vector& phonemes, const std::string& output_path); private: - const std::array input_names = {"input", "input_lengths", "scales"}; - const std::array output_names = {"output"}; - int sample_rate; std::vector scales; diff --git a/models/deep_phonemizer.onnx b/models/deep_phonemizer.onnx index 1da09d0..225b968 100644 Binary files a/models/deep_phonemizer.onnx and b/models/deep_phonemizer.onnx differ diff --git a/scripts/deep_phonemizer/dp_export.py b/scripts/deep_phonemizer/dp_export.py index b2ca937..08618b6 100644 --- a/scripts/deep_phonemizer/dp_export.py +++ b/scripts/deep_phonemizer/dp_export.py @@ -1,18 +1,7 @@ import torch import onnx -from typing import Dict from dp.model.model import AutoregressiveTransformer, ForwardTransformer, load_checkpoint -# Load and process the dictionary file -def load_and_process_dictionary(file_path: str) -> str: - with open(file_path, 'r') as file: - lines = file.readlines() - processed_lines = [] - for line in lines: - word, phonemes = line.strip().split(maxsplit=1) - processed_lines.append(f"{word.lower()}\t{phonemes}") - return "\n".join(processed_lines) - # Load your model checkpoint checkpoint_path = './en_us_cmudict_ipa_forward.pt' model, config = load_checkpoint(checkpoint_path) @@ -82,18 +71,13 @@ def forward(self, text, phonemes=None, start_index=None): # Verify the ONNX model onnx_model = onnx.load(onnx_file_path) -# Load and process dictionary file -dictionary_path = 'babylon_dict.txt' -processed_dictionary = load_and_process_dictionary(dictionary_path) - # Add metadata to the ONNX model metadata = { "languages": "de en_us", "text_symbols": "a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ä ö ü Ä Ö Ü ß", - "phoneme_symbols": "a b d e f g h i j k l m n o p r s t u v w x y z æ ç ð ø ŋ œ ɐ ɑ ɔ ə ɛ ɜ ɹ ɡ ɪ ʁ ʃ ʊ ʌ ʏ ʒ ʔ ' ˌ ː ̃ ̍ ̥ ̩ ̯ ͡ θ", + "phoneme_symbols": "a b d e f g h i j k l m n o p r s t u v w x y z æ ç ð ø ŋ œ ɐ ɑ ɔ ə ɛ ɜ ɹ ɡ ɪ ʁ ʃ ʊ ʌ ʏ ʒ ʔ ' ˌ ː ̃ ̍ ̥ ̩ ̯ ͡ θ . , : ; ? ! \" ( ) -", "char_repeats": "3" if isinstance(model, ForwardTransformer) else "1", - "lowercase": "1", - "dictionary": processed_dictionary + "lowercase": "1" } for key, value in metadata.items(): diff --git a/src/babylon.cpp b/src/babylon.cpp index ccdb545..16bbc7d 100644 --- a/src/babylon.cpp +++ b/src/babylon.cpp @@ -37,6 +37,30 @@ extern "C" { return strdup(phonemes.c_str()); } + BABYLON_EXPORT int* babylon_g2p_tokens(const char* text) { + if (dp == nullptr) { + std::cerr << "DeepPhonemizer session not initialized." << std::endl; + return nullptr; + } + + std::vector phoneme_ids; + try { + phoneme_ids = dp->g2p_tokens(text); + } + catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } + + phoneme_ids.push_back(-1); // Sentinel value + + int* phoneme_ids_arr = new int[phoneme_ids.size()]; + for (size_t i = 0; i < phoneme_ids.size(); i++) { + phoneme_ids_arr[i] = phoneme_ids[i]; + } + + return phoneme_ids_arr; + } + BABYLON_EXPORT void babylon_g2p_free(void) { delete dp; } diff --git a/src/cleaners.cpp b/src/cleaners.cpp index efc7119..9adc875 100644 --- a/src/cleaners.cpp +++ b/src/cleaners.cpp @@ -3,6 +3,29 @@ #include #include #include +#include + +std::unordered_map abbreviations = { + {"mrs", "misess"}, + {"mr", "mister"}, + {"dr", "doctor"}, + {"st", "saint"}, + {"co", "company"}, + {"jr", "junior"}, + {"maj", "major"}, + {"gen", "general"}, + {"drs", "doctors"}, + {"rev", "reverend"}, + {"lt", "lieutenant"}, + {"hon", "honorable"}, + {"sgt", "sergeant"}, + {"capt", "captain"}, + {"esq", "esquire"}, + {"ltd", "limited"}, + {"col", "colonel"}, + {"ft", "foot"}, + {"pty", "proprietary"} +}; std::vector split_into_threes(const std::string& str) { std::vector parts; @@ -177,15 +200,14 @@ namespace DeepPhonemizer { std::vector number_words = numbers_to_words(cleaned_word); words.insert(words.end(), number_words.begin(), number_words.end()); } + else if (abbreviations.find(word) != abbreviations.end()) { + words.push_back(abbreviations[word]); + } else { words.push_back(word); } } - if (!word.empty()) { - words.push_back(word); - } - return words; } } \ No newline at end of file diff --git a/src/phonemizer.cpp b/src/phonemizer.cpp index 955d9ca..1c1de5d 100644 --- a/src/phonemizer.cpp +++ b/src/phonemizer.cpp @@ -5,29 +5,43 @@ #include #include +const std::array input_names = {"text"}; +const std::array output_names = {"output"}; + +std::vector softmax(const std::vector& logits) { + float max_logit = *std::max_element(logits.begin(), logits.end()); + std::vector probabilities(logits.size()); + + float sum = 0.0f; + for (float logit : logits) { + sum += std::exp(logit - max_logit); + } + + for (size_t i = 0; i < logits.size(); ++i) { + probabilities[i] = std::exp(logits[i] - max_logit) / sum; + } + + return probabilities; +} + namespace DeepPhonemizer { SequenceTokenizer::SequenceTokenizer(const std::vector& symbols, const std::vector& languages, int char_repeats, bool lowercase, bool append_start_end) - : char_repeats(char_repeats), lowercase(lowercase), append_start_end(append_start_end), pad_token("_"), end_token("") { + : char_repeats(char_repeats), lowercase(lowercase), append_start_end(append_start_end), pad_token(" "), end_token("") { - pad_index = 0; - token_to_idx[pad_token] = pad_index; + tokens.push_back(pad_token); special_tokens.insert(pad_token); for (const auto& lang : languages) { - std::string lang_token = make_start_token(lang); - token_to_idx[lang_token] = token_to_idx.size(); + std::string lang_token = "<" + lang + ">"; + tokens.push_back(lang_token); special_tokens.insert(lang_token); } - token_to_idx[end_token] = token_to_idx.size(); - end_index = token_to_idx[end_token]; + tokens.push_back(end_token); + end_index = tokens.size() - 1; for (const auto& symbol : symbols) { - token_to_idx[symbol] = token_to_idx.size(); - } - - for (const auto& pair : token_to_idx) { - idx_to_token[pair.second] = pair.first; + tokens.push_back(symbol); } } @@ -40,16 +54,17 @@ namespace DeepPhonemizer { std::vector sequence; for (char c : processed_sentence) { std::string symbol(1, c); - auto it = token_to_idx.find(symbol); - if (it != token_to_idx.end()) { + auto index = get_token(symbol); + if (index != -1) { for (int i = 0; i < char_repeats; ++i) { - sequence.push_back(it->second); + sequence.push_back(index); } } } if (append_start_end) { - sequence.insert(sequence.begin(), get_start_index(language)); + auto index = get_token("<" + language + ">"); + sequence.insert(sequence.begin(), index); sequence.push_back(end_index); } @@ -66,67 +81,63 @@ namespace DeepPhonemizer { return sequence; } - std::vector SequenceTokenizer::decode(const std::vector& sequence, bool remove_special_tokens) const { - std::vector pruned_sequence = sequence; - pruned_sequence.erase( - std::remove(pruned_sequence.begin(), pruned_sequence.end(), pad_index), - pruned_sequence.end() - ); - + std::vector SequenceTokenizer::decode(const std::vector& sequence) const { std::vector processed_sequence; if (append_start_end) { - processed_sequence.push_back(pruned_sequence.front()); - for (size_t i = 1; i < pruned_sequence.size() - 1; i += char_repeats) { - processed_sequence.push_back(pruned_sequence[i]); + processed_sequence.push_back(sequence.front()); + for (size_t i = 1; i < sequence.size() - 1; i += char_repeats) { + processed_sequence.push_back(sequence[i]); } - processed_sequence.push_back(pruned_sequence.back()); + processed_sequence.push_back(sequence.back()); } else { - for (size_t i = 0; i < pruned_sequence.size(); i += char_repeats) { - processed_sequence.push_back(pruned_sequence[i]); + for (size_t i = 0; i < sequence.size(); i += char_repeats) { + processed_sequence.push_back(sequence[i]); } } - // Remove consecutive duplicate tokens - auto last = std::unique(processed_sequence.begin(), processed_sequence.end()); - processed_sequence.erase(last, processed_sequence.end()); - std::vector decoded; for (int64_t token : processed_sequence) { if (token == end_index) { break; } - if (remove_special_tokens && special_tokens.count(idx_to_token.at(token))) { - continue; - } - decoded.push_back(idx_to_token.at(token)); + decoded.push_back(tokens[token]); } return decoded; } - int SequenceTokenizer::get_start_index(const std::string& language) const { - std::string lang_token = make_start_token(language); - return token_to_idx.at(lang_token); - } + std::vector SequenceTokenizer::clean(const std::vector& sequence) const { + std::vector processed_sequence = sequence; - std::string SequenceTokenizer::make_start_token(const std::string& language) const { - return "<" + language + ">"; - } + // remove all special tokens from the sequence + for (auto token : special_tokens) { + auto special_token_index = get_token(token); + if (special_token_index != -1) { + processed_sequence.erase(std::remove(processed_sequence.begin(), processed_sequence.end(), special_token_index), processed_sequence.end()); + } + } + + // extract everything between the start and end tokens + auto end = std::find(processed_sequence.begin(), processed_sequence.end(), end_index); + if (end != processed_sequence.end()) { + processed_sequence.erase(end, processed_sequence.end()); + } - std::vector softmax(const std::vector& logits) { - float max_logit = *std::max_element(logits.begin(), logits.end()); - std::vector probabilities(logits.size()); + // Remove consecutive duplicate tokens + auto last = std::unique(processed_sequence.begin(), processed_sequence.end()); + processed_sequence.erase(last, processed_sequence.end()); + + return processed_sequence; + } - float sum = 0.0f; - for (float logit : logits) { - sum += std::exp(logit - max_logit); - } + int64_t SequenceTokenizer::get_token(const std::string& token) const { + auto it = std::find(tokens.begin(), tokens.end(), token); - for (size_t i = 0; i < logits.size(); ++i) { - probabilities[i] = std::exp(logits[i] - max_logit) / sum; + if (it != tokens.end()) { + return std::distance(tokens.begin(), it); } - return probabilities; + return -1; } Session::Session(const std::string& model_path, const std::string language, const bool use_punctuation) { @@ -170,24 +181,6 @@ namespace DeepPhonemizer { phoneme_symbols.push_back(phoneme_symbol_buffer); } - std::string dictonary_str = model_metadata.LookupCustomMetadataMapAllocated("dictionary", allocator).get(); - - std::istringstream dictionary_stream(dictonary_str); - std::string line; - while (std::getline(dictionary_stream, line)) { - std::stringstream line_stream(line); - std::string word; - line_stream >> word; - - std::vector phonemes; - std::string phoneme; - while (line_stream >> phoneme) { - phonemes.push_back(phoneme); - } - - dictionary[word] = phonemes; - } - int char_repeats = model_metadata.LookupCustomMetadataMapAllocated("char_repeats", allocator).get()[0] - '0'; bool lowercase = model_metadata.LookupCustomMetadataMapAllocated("lowercase", allocator).get()[0] == '1'; @@ -209,41 +202,48 @@ namespace DeepPhonemizer { } std::vector Session::g2p(const std::string& text) { + // Convert input text to phonemes + std::vector phoneme_tokens = g2p_tokens(text); + + // Decode the phoneme tokens + return phoneme_tokenizer->decode(phoneme_tokens); + } + + std::vector Session::g2p_tokens(const std::string& text) { // Clean the input text std::vector words = clean_text(text); // Convert each word to phonemes - std::vector phonemes; + std::vector phoneme_ids; for (const auto& word : words) { - std::vector word_phonemes = g2p_internal(word); + std::vector word_phoneme_ids = g2p_tokens_internal(word); + + std::vector cleaned_word_phoneme_ids = phoneme_tokenizer->clean(word_phoneme_ids); - phonemes.insert(phonemes.end(), word_phonemes.begin(), word_phonemes.end()); + phoneme_ids.insert(phoneme_ids.end(), cleaned_word_phoneme_ids.begin(), cleaned_word_phoneme_ids.end()); if (punctuation) { + auto back_token = phoneme_tokenizer->get_token(std::string(1, word.back())); + // Check if the word ends with punctuation - if (std::ispunct(word.back())) { - phonemes.push_back(std::string(1, word.back())); + if (std::ispunct(word.back()) && back_token != -1) { + phoneme_ids.push_back(back_token); } } - phonemes.push_back(" "); + phoneme_ids.push_back(0); } - return phonemes; + return phoneme_ids; } - std::vector Session::g2p_internal(const std::string& text) { + std::vector Session::g2p_tokens_internal(const std::string& text) { // Check if the input text is longer than one character std::string key_text = text; std::transform(key_text.begin(), key_text.end(), key_text.begin(), ::tolower); key_text.erase(std::remove_if(key_text.begin(), key_text.end(), ::ispunct), key_text.end()); - // First check if word is in the dictionary - if (dictionary.count(key_text)) { - return dictionary.at(key_text); - } - // Convert input text to tensor std::vector input_tensors; std::vector input_ids = text_tokenizer->operator()(text, lang); @@ -294,9 +294,6 @@ namespace DeepPhonemizer { output_ids_vector[i] = std::distance(probabilities.begin(), max_prob_iter); } - // Convert output IDs to phonemes - std::vector phonemes = phoneme_tokenizer->decode(output_ids_vector, true); - - return phonemes; + return output_ids_vector; } } \ No newline at end of file diff --git a/src/voice.cpp b/src/voice.cpp index 616e654..ef50190 100644 --- a/src/voice.cpp +++ b/src/voice.cpp @@ -7,6 +7,9 @@ #include #include +const std::array input_names = {"input", "input_lengths", "scales"}; +const std::array output_names = {"output"}; + struct WavHeader { uint8_t RIFF[4] = {'R', 'I', 'F', 'F'}; uint32_t chunk_size; @@ -46,13 +49,6 @@ namespace Vits { for (const auto& phoneme : phonemes) { try { int64_t id = token_to_idx.at(phoneme); - - // This is to handle the subtle difference between deep_phonemizer and espeak-ng - if (id == 27 || id == 62) { - phoneme_ids.push_back(120); - phoneme_ids.push_back(0); - } - phoneme_ids.push_back(id); phoneme_ids.push_back(0); } diff --git a/wrappers/babylon.py b/wrappers/babylon.py index d395c0d..b3b0aeb 100644 --- a/wrappers/babylon.py +++ b/wrappers/babylon.py @@ -1,13 +1,17 @@ import ctypes import os +current_dir = os.path.dirname(os.path.abspath(__file__)) + +symbols = " abdefghijklmnoprstuvwxyzæçðøŋœɐɑɔəɛɜɹɡɪʁʃʊʌʏʒʔ'ˌː ̃ ̍ ̥ ̩ ̯ ͡θ.,:;?!\"()-" + # Load the shared library if os.name == 'nt': # Windows - babylon_lib = ctypes.CDLL('libbabylon.dll') + babylon_lib = ctypes.CDLL(os.path.join(current_dir, 'windows', 'libbabylon.dll')) elif os.name == 'posix': # macOS - babylon_lib = ctypes.CDLL('./libbabylon.dylib') + babylon_lib = ctypes.CDLL(os.path.join(current_dir, 'macos', 'libbabylon.dylib')) else: # Linux/Unix - babylon_lib = ctypes.CDLL('./babylon.so') + babylon_lib = ctypes.CDLL(os.path.join(current_dir, 'linux', 'libbabylon.so')) # Define the function prototypes babylon_lib.babylon_g2p_init.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.c_int] @@ -16,6 +20,9 @@ babylon_lib.babylon_g2p.argtypes = [ctypes.c_char_p] babylon_lib.babylon_g2p.restype = ctypes.c_char_p +babylon_lib.babylon_g2p_tokens.argtypes = [ctypes.c_char_p] +babylon_lib.babylon_g2p_tokens.restype = ctypes.POINTER(ctypes.c_int) + babylon_lib.babylon_g2p_free.argtypes = [] babylon_lib.babylon_g2p_free.restype = None @@ -37,6 +44,19 @@ def g2p(text): result = babylon_lib.babylon_g2p(text.encode('utf-8')) return result.decode('utf-8') +# Use G2P with tokens +def g2p_tokens(text): + result_ptr = babylon_lib.babylon_g2p_tokens(text.encode('utf-8')) + + # Convert the pointer to a Python list, stopping at -1 + tokens = [] + i = 0 + while result_ptr[i] != -1: + tokens.append(result_ptr[i]) + i += 1 + + return tokens + # Free G2P resources def free_g2p(): babylon_lib.babylon_g2p_free() @@ -55,8 +75,8 @@ def free_tts(): # Example usage if __name__ == '__main__': - g2p_model_path = '../models/deep_phonemizer.onnx' - tts_model_path = '../models/curie.onnx' + g2p_model_path = os.path.join(current_dir, "models", "deep_phonemizer.onnx") + tts_model_path = os.path.join(current_dir, "models", "curie.onnx") language = 'en_us' use_punctuation = 1 sequence = 'Hello world, This is a python test of babylon' @@ -65,6 +85,9 @@ def free_tts(): print('G2P initialized successfully') phonemes = g2p(sequence) print(f'Phonemes: {phonemes}') + + tokens = g2p_tokens(sequence) + print(f'Tokens: {tokens}') else: print('Failed to initialize G2P')