diff --git a/_script/export-checkpoints.sh b/_script/export-checkpoints.sh index 124eab3..07b571c 100755 --- a/_script/export-checkpoints.sh +++ b/_script/export-checkpoints.sh @@ -86,7 +86,8 @@ find "${piper_checkpoints}" -name '*.ckpt' | sort | \ if [ ! -s "${onnx}" ]; then # Export to onnx and optimize - python3 -m piper_train.export_onnx \ + PYTHONPATH="${repo_dir}/src/python" \ + python3 -m piper_train.export_onnx \ "${checkpoint}" \ "${onnx}.unoptimized";
diff --git a/_script/generate-samples.sh b/_script/generate-samples.sh
new file mode 100755
index 0000000..126bc22
--- /dev/null
+++ b/_script/generate-samples.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+# Generates audio samples for voices.
+#
+# For each *.onnx model found under the given voices directory, the first
+# sentence of the language family's test sentences is synthesized once per
+# speaker and written to <voice_dir>/samples/speaker_<id>.mp3. Samples that
+# already exist with a size of at least 1000 bytes are not regenerated.
+#
+# Usage: generate-samples.sh <piper_voices>
+# Requires: ffmpeg jq
+
+if [ -z "$1" ]; then
+    echo 'Usage: generate-samples.sh <piper_voices>' >&2
+    exit 1
+fi
+
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+repo_dir="$(realpath "${this_dir}/../../")"
+
+# Use the Python virtual environment when one is present
+venv="${repo_dir}/src/python/.venv"
+if [ -d "${venv}" ]; then
+    source "${venv}/bin/activate"
+fi
+
+# -----------------------------------------------------------------------------
+
+piper_voices="$1"
+piper_binary="${repo_dir}/install/piper"
+
+# The directory names above each model are read as:
+#   <language_family>/<language>/<dataset>/<quality>/<model>.onnx
+find "${piper_voices}" -name '*.onnx' | sort | \
+    while IFS= read -r onnx; do
+        voice_dir="$(dirname "${onnx}")";
+        quality="$(basename "${voice_dir}")"
+        dataset_dir="$(dirname "${voice_dir}")";
+        dataset="$(basename "${dataset_dir}")"
+        language_dir="$(dirname "${dataset_dir}")";
+        language_family_dir="$(dirname "${language_dir}")";
+        language_family="$(basename "${language_family_dir}")"
+
+        test_sentences="${repo_dir}/etc/test_sentences/${language_family}.txt"
+        if [ ! -s "${test_sentences}" ]; then
+            echo "[ERROR] Missing ${test_sentences}" >&2;
+            continue;
+        fi
+
+        # Skip the voice instead of aborting the whole run when its config is absent
+        config="${onnx}.json"
+        if [ ! -s "${config}" ]; then
+            echo "[ERROR] Missing ${config}" >&2;
+            continue;
+        fi
+
+        num_speakers="$(jq --raw-output '.num_speakers' "${config}")"
+        sample_rate="$(jq --raw-output '.audio.sample_rate' "${config}")"
+
+        # num_speakers is used in shell arithmetic below, so only accept digits
+        case "${num_speakers}" in
+            ''|*[!0-9]*)
+                echo "[ERROR] Invalid num_speakers '${num_speakers}' in ${config}" >&2
+                continue
+                ;;
+        esac
+
+        samples_dir="${voice_dir}/samples"
+        mkdir -p "${samples_dir}"
+
+        # Generate a sample from the first test sentence for each speaker
+        for (( speaker_id = 0; speaker_id < num_speakers; speaker_id++ )); do
+            sample_mp3="${samples_dir}/speaker_${speaker_id}.mp3"
+            if [ -s "${sample_mp3}" ]; then
+                sample_mp3_size="$(stat --printf='%s' "${sample_mp3}")"
+            else
+                sample_mp3_size='0'
+            fi
+
+            # Files under 1000 bytes are treated as missing/incomplete and redone
+            if [ "${sample_mp3_size}" -lt 1000 ]; then
+                echo "Generating sample for ${dataset} (quality=${quality}, speaker=${speaker_id})"
+
+                # Compress to MP3 with ffmpeg
+                head -n1 "${test_sentences}" | \
+                    "${piper_binary}" --model "${onnx}" --speaker "${speaker_id}" --output_raw | \
+                    ffmpeg -hide_banner -loglevel warning -y \
+                        -sample_rate "${sample_rate}" -f s16le -ac 1 -i - \
+                        -codec:a libmp3lame -qscale:a 2 "${sample_mp3}";
+            fi;
+        done
+    done
diff --git a/samples/en/en_GB/cori/high/MODEL_CARD b/samples/en/en_GB/cori/high/MODEL_CARD new file mode 100644 index 0000000..6d70667 --- /dev/null +++ b/samples/en/en_GB/cori/high/MODEL_CARD @@ -0,0 +1,17 @@ +# Model card for cori (high) + +* Language: en_GB (English, Great Britain) +* Speakers: 1 +* Quality: medium +* Samplerate: 22,050Hz + +## Dataset + +* URL: https://librivox.org +* License: public domain + +## Training + +See: https://brycebeattie.com/files/tts/ + +UK English female voice. Single Speaker. Trained from scratch on high quality settings for 500 epochs. I put together the dataset, which ended up with about 24 hours of recordings. All recordings came from LibriVox.org.
diff --git a/samples/en/en_GB/cori/high/sample.txt b/samples/en/en_GB/cori/high/sample.txt new file mode 100644 index 0000000..b0a2cf8 --- /dev/null +++ b/samples/en/en_GB/cori/high/sample.txt @@ -0,0 +1 @@ +A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky. diff --git a/samples/en/en_GB/cori/high/speaker_0.mp3 b/samples/en/en_GB/cori/high/speaker_0.mp3 new file mode 100644 index 0000000..610039b Binary files /dev/null and b/samples/en/en_GB/cori/high/speaker_0.mp3 differ diff --git a/samples/en/en_US/kristin/medium/MODEL_CARD b/samples/en/en_US/kristin/medium/MODEL_CARD new file mode 100644 index 0000000..40000a8 --- /dev/null +++ b/samples/en/en_US/kristin/medium/MODEL_CARD @@ -0,0 +1,17 @@ +# Model card for kristin (medium) + +* Language: en_US(English, United States) +* Speakers: 1 +* Quality: medium +* Samplerate: 22,050Hz + +## Dataset + +* URL: https://librivox.org +* License: public domain + +## Training + +See: https://brycebeattie.com/files/tts/ + +US English female voice. Single Speaker. Trained from scratch on medium quality settings for 2000 epochs. I put together the dataset, which ended up with about 11.5 hours of recordings. All recordings came from LibriVox.org. diff --git a/samples/en/en_US/kristin/medium/sample.txt b/samples/en/en_US/kristin/medium/sample.txt new file mode 100644 index 0000000..b0a2cf8 --- /dev/null +++ b/samples/en/en_US/kristin/medium/sample.txt @@ -0,0 +1 @@ +A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky. 
diff --git a/samples/en/en_US/kristin/medium/speaker_0.mp3 b/samples/en/en_US/kristin/medium/speaker_0.mp3 new file mode 100644 index 0000000..63b9485 Binary files /dev/null and b/samples/en/en_US/kristin/medium/speaker_0.mp3 differ diff --git a/samples/en/en_US/ljspeech/high/MODEL_CARD b/samples/en/en_US/ljspeech/high/MODEL_CARD new file mode 100644 index 0000000..a7c9acc --- /dev/null +++ b/samples/en/en_US/ljspeech/high/MODEL_CARD @@ -0,0 +1,17 @@ +# Model card for ljspeech (high) + +* Language: en_US (English, United States) +* Speakers: 1 +* Quality: medium +* Samplerate: 22,050Hz + +## Dataset + +* URL: https://keithito.com/LJ-Speech-Dataset/ +* License: public domain + +## Training + +See: https://brycebeattie.com/files/tts/ + +US English female voice. Single speaker. Trained from scratch for 1000 epochs on medium quality settings using the LJ Speech dataset. I reencoded the recordings to a bit rate of 22500 Hz so it would match other voices released for Piper TTS. diff --git a/samples/en/en_US/ljspeech/high/sample.txt b/samples/en/en_US/ljspeech/high/sample.txt new file mode 100644 index 0000000..b0a2cf8 --- /dev/null +++ b/samples/en/en_US/ljspeech/high/sample.txt @@ -0,0 +1 @@ +A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky. 
diff --git a/samples/en/en_US/ljspeech/high/speaker_0.mp3 b/samples/en/en_US/ljspeech/high/speaker_0.mp3 new file mode 100644 index 0000000..ae1260a Binary files /dev/null and b/samples/en/en_US/ljspeech/high/speaker_0.mp3 differ diff --git a/samples/en/en_US/ljspeech/medium/MODEL_CARD b/samples/en/en_US/ljspeech/medium/MODEL_CARD new file mode 100644 index 0000000..324d747 --- /dev/null +++ b/samples/en/en_US/ljspeech/medium/MODEL_CARD @@ -0,0 +1,17 @@ +# Model card for ljspeech (medium) + +* Language: en_US (English, United States) +* Speakers: 1 +* Quality: medium +* Samplerate: 22,050Hz + +## Dataset + +* URL: https://keithito.com/LJ-Speech-Dataset/ +* License: public domain + +## Training + +See: https://brycebeattie.com/files/tts/ + +US English female voice. Single speaker. Trained from scratch for 1000 epochs on medium quality settings using the LJ Speech dataset. I reencoded the recordings to a bit rate of 22500 Hz so it would match other voices released for Piper TTS. diff --git a/samples/en/en_US/ljspeech/medium/sample.txt b/samples/en/en_US/ljspeech/medium/sample.txt new file mode 100644 index 0000000..b0a2cf8 --- /dev/null +++ b/samples/en/en_US/ljspeech/medium/sample.txt @@ -0,0 +1 @@ +A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky. 
diff --git a/samples/en/en_US/ljspeech/medium/speaker_0.mp3 b/samples/en/en_US/ljspeech/medium/speaker_0.mp3 new file mode 100644 index 0000000..f97a898 Binary files /dev/null and b/samples/en/en_US/ljspeech/medium/speaker_0.mp3 differ diff --git a/samples/es/es_MX/claude/high/MODEL_CARD b/samples/es/es_MX/claude/high/MODEL_CARD new file mode 100644 index 0000000..6f5160c --- /dev/null +++ b/samples/es/es_MX/claude/high/MODEL_CARD @@ -0,0 +1,15 @@ +# Model card for claude (high) + +* Language: es_MX (Spanish, Mexico) +* Speakers: 1 +* Quality: high +* Samplerate: 22,050Hz + +## Dataset + +* URL: https://huggingface.co/spaces/HirCoir/Piper-TTS-Spanish +* License: apache-2.0 + +## Training + +See URL above diff --git a/samples/es/es_MX/claude/high/sample.txt b/samples/es/es_MX/claude/high/sample.txt new file mode 100644 index 0000000..3750a8c --- /dev/null +++ b/samples/es/es_MX/claude/high/sample.txt @@ -0,0 +1 @@ +Un arcoíris​ o arco iris es un fenómeno óptico y meteorológico que consiste en la aparición en el cielo de un arco de luz multicolor, originado por la descomposición de la luz solar en el espectro visible, la cual se produce por refracción, cuando los rayos del sol atraviesan pequeñas gotas de agua contenidas en la atmósfera terrestre. 
diff --git a/samples/es/es_MX/claude/high/speaker_0.mp3 b/samples/es/es_MX/claude/high/speaker_0.mp3 new file mode 100644 index 0000000..502d18e Binary files /dev/null and b/samples/es/es_MX/claude/high/speaker_0.mp3 differ diff --git a/voices.json b/voices.json index caf1afa..02d3316 100644 --- a/voices.json +++ b/voices.json @@ -968,6 +968,36 @@ }, "aliases": [] }, + "en_GB-cori-high": { + "key": "en_GB-cori-high", + "name": "cori", + "language": { + "code": "en_GB", + "family": "en", + "region": "GB", + "name_native": "English", + "name_english": "English", + "country_english": "Great Britain" + }, + "quality": "high", + "num_speakers": 1, + "speaker_id_map": {}, + "files": { + "en/en_GB/cori/high/en_GB-cori-high.onnx": { + "size_bytes": 114219352, + "md5_digest": "3474a80133d9a03e6870d2ac42c18806" + }, + "en/en_GB/cori/high/en_GB-cori-high.onnx.json": { + "size_bytes": 4963, + "md5_digest": "0f7d42e77a99193006aa34a34442f5e0" + }, + "en/en_GB/cori/high/MODEL_CARD": { + "size_bytes": 470, + "md5_digest": "93702d22a081e5efe612625f2a9d6d51" + } + }, + "aliases": [] + }, "en_GB-jenny_dioco-medium": { "key": "en_GB-jenny_dioco-medium", "name": "jenny_dioco", @@ -1498,6 +1528,36 @@ "en-us-kathleen-low" ] }, + "en_US-kristin-medium": { + "key": "en_US-kristin-medium", + "name": "kristin", + "language": { + "code": "en_US", + "family": "en", + "region": "US", + "name_native": "English", + "name_english": "English", + "country_english": "United States" + }, + "quality": "medium", + "num_speakers": 1, + "speaker_id_map": {}, + "files": { + "en/en_US/kristin/medium/en_US-kristin-medium.onnx": { + "size_bytes": 63531379, + "md5_digest": "5fed42d2296baca042e2bf74785db725" + }, + "en/en_US/kristin/medium/en_US-kristin-medium.onnx.json": { + "size_bytes": 4968, + "md5_digest": "70bc97d350c796c64ea5e4d08241afac" + }, + "en/en_US/kristin/medium/MODEL_CARD": { + "size_bytes": 479, + "md5_digest": "9bfb8192299b34cddf76455f04cb8cd2" + } + }, + "aliases": [] + }, 
"en_US-kusal-medium": { "key": "en_US-kusal-medium", "name": "kusal", @@ -3549,6 +3609,66 @@ }, "aliases": [] }, + "en_US-ljspeech-high": { + "key": "en_US-ljspeech-high", + "name": "ljspeech", + "language": { + "code": "en_US", + "family": "en", + "region": "US", + "name_native": "English", + "name_english": "English", + "country_english": "United States" + }, + "quality": "high", + "num_speakers": 1, + "speaker_id_map": {}, + "files": { + "en/en_US/ljspeech/high/en_US-ljspeech-high.onnx": { + "size_bytes": 114199011, + "md5_digest": "dad093b5d2cff6a5fda99883ceda09d1" + }, + "en/en_US/ljspeech/high/en_US-ljspeech-high.onnx.json": { + "size_bytes": 4967, + "md5_digest": "1f93986f7c39647383f2a0b1471f30aa" + }, + "en/en_US/ljspeech/high/MODEL_CARD": { + "size_bytes": 515, + "md5_digest": "59322a9a8d2c0e556f0be1171cd54ea7" + } + }, + "aliases": [] + }, + "en_US-ljspeech-medium": { + "key": "en_US-ljspeech-medium", + "name": "ljspeech", + "language": { + "code": "en_US", + "family": "en", + "region": "US", + "name_native": "English", + "name_english": "English", + "country_english": "United States" + }, + "quality": "medium", + "num_speakers": 1, + "speaker_id_map": {}, + "files": { + "en/en_US/ljspeech/medium/en_US-ljspeech-medium.onnx": { + "size_bytes": 63531379, + "md5_digest": "109d552e9dd78d92d1169a7edd6de38d" + }, + "en/en_US/ljspeech/medium/en_US-ljspeech-medium.onnx.json": { + "size_bytes": 4969, + "md5_digest": "ae02d7c3f01e9606dcac4668e58720c9" + }, + "en/en_US/ljspeech/medium/MODEL_CARD": { + "size_bytes": 517, + "md5_digest": "5ad31d314786587d4f97effb0b716d61" + } + }, + "aliases": [] + }, "en_US-ryan-high": { "key": "en_US-ryan-high", "name": "ryan", @@ -3834,6 +3954,36 @@ }, "aliases": [] }, + "es_MX-claude-high": { + "key": "es_MX-claude-high", + "name": "claude", + "language": { + "code": "es_MX", + "family": "es", + "region": "MX", + "name_native": "Español", + "name_english": "Spanish", + "country_english": "Mexico" + }, + "quality": "high", + 
"num_speakers": 1, + "speaker_id_map": {}, + "files": { + "es/es_MX/claude/high/es_MX-claude-high.onnx": { + "size_bytes": 63122309, + "md5_digest": "cb1966e0ff20ca3aa010f6c9a0ce296a" + }, + "es/es_MX/claude/high/es_MX-claude-high.onnx.json": { + "size_bytes": 4963, + "md5_digest": "36882201922fabb409f18d284af3eddd" + }, + "es/es_MX/claude/high/MODEL_CARD": { + "size_bytes": 247, + "md5_digest": "bca99dc9b9a6c8f2f8563d73d52c4a3b" + } + }, + "aliases": [] + }, "fa_IR-amir-medium": { "key": "fa_IR-amir-medium", "name": "amir",