Add cori, kristin, ljspeech, claude

rhasspy · Mar 12, 2024 · c9ecb8f · c9ecb8f
1 parent 857377c
commit c9ecb8f
Show file tree

Hide file tree

Showing 18 changed files with 309 additions and 1 deletion.
diff --git a/_script/export-checkpoints.sh b/_script/export-checkpoints.sh
@@ -86,7 +86,8 @@ find "${piper_checkpoints}" -name '*.ckpt' | sort | \
 
         if [ ! -s "${onnx}" ]; then
             # Export to onnx and optimize
-            python3 -m piper_train.export_onnx \
+            PYTHONPATH="${repo_dir}/src/python" \
+                python3 -m piper_train.export_onnx \
                     "${checkpoint}" \
                     "${onnx}.unoptimized";
 

diff --git a/_script/generate-samples.sh b/_script/generate-samples.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+# Generates audio samples for voices.
+# Requires: ffmpeg jq
+
+if [ -z "$1" ]; then
+    echo 'Usage: generate-samples.sh <piper-voices>'
+    exit 1
+fi
+
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+repo_dir="$(realpath "${this_dir}/../../")"
+
+venv="${repo_dir}/src/python/.venv"
+if [ -d "${venv}" ]; then
+    source "${venv}/bin/activate"
+fi
+
+# -----------------------------------------------------------------------------
+
+piper_voices="$1"
+piper_binary="${repo_dir}/install/piper"
+
+find "${piper_voices}" -name '*.onnx' | sort | \
+    while read -r onnx; do
+        voice_dir="$(dirname "${onnx}")";
+        quality="$(basename "${voice_dir}")"
+        dataset_dir="$(dirname "${voice_dir}")";
+        dataset="$(basename "${dataset_dir}")"
+        language_dir="$(dirname "${dataset_dir}")";
+        language="$(basename "${language_dir}")"
+        language_family_dir="$(dirname "${language_dir}")";
+        language_family="$(basename "${language_family_dir}")"
+
+        test_sentences="${repo_dir}/etc/test_sentences/${language_family}.txt"
+        if [ ! -s "${test_sentences}" ]; then
+            echo "[ERROR] Missing ${test_sentences}" >&2;
+            continue;
+        fi
+
+        samples_dir="${voice_dir}/samples"
+        mkdir -p "${samples_dir}"
+
+        num_speakers="$(jq --raw-output '.num_speakers' "${onnx}.json")"
+        sample_rate="$(jq --raw-output '.audio.sample_rate' "${onnx}.json")"
+        last_speaker_id="$((num_speakers-1))"
+
+        # Generate a sample from the first test sentence for each speaker
+        for speaker_id in `seq 0 ${last_speaker_id}`; do
+            sample_mp3="${samples_dir}/speaker_${speaker_id}.mp3"
+            if [ -s "${sample_mp3}" ]; then
+                sample_mp3_size="$(stat --printf='%s' "${sample_mp3}")"
+            else
+                sample_mp3_size='0'
+            fi
+
+            if [ "${sample_mp3_size}" -lt 1000 ]; then
+                echo "Generating sample for ${dataset} (quality=${quality}, speaker=${speaker_id})"
+
+                # Compress to MP3 with ffmpeg
+                head -n1 "${test_sentences}" | \
+                    "${piper_binary}" --model "${onnx}" --speaker "${speaker_id}" --output_raw | \
+                    ffmpeg -hide_banner -loglevel warning -y \
+                        -sample_rate "${sample_rate}" -f s16le -ac 1 -i - \
+                        -codec:a libmp3lame -qscale:a 2 "${sample_mp3}";
+            fi;
+        done
+    done
diff --git a/samples/en/en_GB/cori/high/MODEL_CARD b/samples/en/en_GB/cori/high/MODEL_CARD
@@ -0,0 +1,17 @@
+# Model card for cori (high)
+
+* Language: en_GB (English, Great Britain)
+* Speakers: 1
+* Quality: high
+* Samplerate: 22,050Hz
+
+## Dataset
+
+* URL: https://librivox.org
+* License: public domain
+
+## Training
+
+See: https://brycebeattie.com/files/tts/
+
+UK English female voice. Single Speaker. Trained from scratch on high quality settings for 500 epochs. I put together the dataset, which ended up with about 24 hours of recordings. All recordings came from LibriVox.org.
diff --git a/samples/en/en_GB/cori/high/sample.txt b/samples/en/en_GB/cori/high/sample.txt
@@ -0,0 +1 @@
+A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.
diff --git a/samples/en/en_GB/cori/high/speaker_0.mp3 b/samples/en/en_GB/cori/high/speaker_0.mp3
diff --git a/samples/en/en_US/kristin/medium/MODEL_CARD b/samples/en/en_US/kristin/medium/MODEL_CARD
@@ -0,0 +1,17 @@
+# Model card for kristin (medium)
+
+* Language: en_US (English, United States)
+* Speakers: 1
+* Quality: medium
+* Samplerate: 22,050Hz
+
+## Dataset
+
+* URL: https://librivox.org
+* License: public domain
+
+## Training
+
+See: https://brycebeattie.com/files/tts/
+
+US English female voice. Single Speaker. Trained from scratch on medium quality settings for 2000 epochs. I put together the dataset, which ended up with about 11.5 hours of recordings. All recordings came from LibriVox.org.
diff --git a/samples/en/en_US/kristin/medium/sample.txt b/samples/en/en_US/kristin/medium/sample.txt
@@ -0,0 +1 @@
+A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.
diff --git a/samples/en/en_US/kristin/medium/speaker_0.mp3 b/samples/en/en_US/kristin/medium/speaker_0.mp3
diff --git a/samples/en/en_US/ljspeech/high/MODEL_CARD b/samples/en/en_US/ljspeech/high/MODEL_CARD
@@ -0,0 +1,17 @@
+# Model card for ljspeech (high)
+
+* Language: en_US (English, United States)
+* Speakers: 1
+* Quality: high
+* Samplerate: 22,050Hz
+
+## Dataset
+
+* URL: https://keithito.com/LJ-Speech-Dataset/
+* License: public domain
+
+## Training
+
+See: https://brycebeattie.com/files/tts/
+
+US English female voice. Single speaker. Trained from scratch for 1000 epochs on medium quality settings using the LJ Speech dataset. I re-encoded the recordings to a sample rate of 22,050 Hz so it would match other voices released for Piper TTS.
diff --git a/samples/en/en_US/ljspeech/high/sample.txt b/samples/en/en_US/ljspeech/high/sample.txt
@@ -0,0 +1 @@
+A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.
diff --git a/samples/en/en_US/ljspeech/high/speaker_0.mp3 b/samples/en/en_US/ljspeech/high/speaker_0.mp3
diff --git a/samples/en/en_US/ljspeech/medium/MODEL_CARD b/samples/en/en_US/ljspeech/medium/MODEL_CARD
@@ -0,0 +1,17 @@
+# Model card for ljspeech (medium)
+
+* Language: en_US (English, United States)
+* Speakers: 1
+* Quality: medium
+* Samplerate: 22,050Hz
+
+## Dataset
+
+* URL: https://keithito.com/LJ-Speech-Dataset/
+* License: public domain
+
+## Training
+
+See: https://brycebeattie.com/files/tts/
+
+US English female voice. Single speaker. Trained from scratch for 1000 epochs on medium quality settings using the LJ Speech dataset. I re-encoded the recordings to a sample rate of 22,050 Hz so it would match other voices released for Piper TTS.
diff --git a/samples/en/en_US/ljspeech/medium/sample.txt b/samples/en/en_US/ljspeech/medium/sample.txt
@@ -0,0 +1 @@
+A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.
diff --git a/samples/en/en_US/ljspeech/medium/speaker_0.mp3 b/samples/en/en_US/ljspeech/medium/speaker_0.mp3
diff --git a/samples/es/es_MX/claude/high/MODEL_CARD b/samples/es/es_MX/claude/high/MODEL_CARD
@@ -0,0 +1,15 @@
+# Model card for claude (high)
+
+* Language: es_MX (Spanish, Mexico)
+* Speakers: 1
+* Quality: high
+* Samplerate: 22,050Hz
+
+## Dataset
+
+* URL: https://huggingface.co/spaces/HirCoir/Piper-TTS-Spanish
+* License: apache-2.0
+
+## Training
+
+See URL above
diff --git a/samples/es/es_MX/claude/high/sample.txt b/samples/es/es_MX/claude/high/sample.txt
@@ -0,0 +1 @@
+Un arcoíris o arco iris es un fenómeno óptico y meteorológico que consiste en la aparición en el cielo de un arco de luz multicolor, originado por la descomposición de la luz solar en el espectro visible, la cual se produce por refracción, cuando los rayos del sol atraviesan pequeñas gotas de agua contenidas en la atmósfera terrestre.
diff --git a/samples/es/es_MX/claude/high/speaker_0.mp3 b/samples/es/es_MX/claude/high/speaker_0.mp3
diff --git a/voices.json b/voices.json
@@ -968,6 +968,36 @@
         },
         "aliases": []
     },
+    "en_GB-cori-high": {
+        "key": "en_GB-cori-high",
+        "name": "cori",
+        "language": {
+            "code": "en_GB",
+            "family": "en",
+            "region": "GB",
+            "name_native": "English",
+            "name_english": "English",
+            "country_english": "Great Britain"
+        },
+        "quality": "high",
+        "num_speakers": 1,
+        "speaker_id_map": {},
+        "files": {
+            "en/en_GB/cori/high/en_GB-cori-high.onnx": {
+                "size_bytes": 114219352,
+                "md5_digest": "3474a80133d9a03e6870d2ac42c18806"
+            },
+            "en/en_GB/cori/high/en_GB-cori-high.onnx.json": {
+                "size_bytes": 4963,
+                "md5_digest": "0f7d42e77a99193006aa34a34442f5e0"
+            },
+            "en/en_GB/cori/high/MODEL_CARD": {
+                "size_bytes": 470,
+                "md5_digest": "93702d22a081e5efe612625f2a9d6d51"
+            }
+        },
+        "aliases": []
+    },
     "en_GB-jenny_dioco-medium": {
         "key": "en_GB-jenny_dioco-medium",
         "name": "jenny_dioco",
@@ -1498,6 +1528,36 @@
             "en-us-kathleen-low"
         ]
     },
+    "en_US-kristin-medium": {
+        "key": "en_US-kristin-medium",
+        "name": "kristin",
+        "language": {
+            "code": "en_US",
+            "family": "en",
+            "region": "US",
+            "name_native": "English",
+            "name_english": "English",
+            "country_english": "United States"
+        },
+        "quality": "medium",
+        "num_speakers": 1,
+        "speaker_id_map": {},
+        "files": {
+            "en/en_US/kristin/medium/en_US-kristin-medium.onnx": {
+                "size_bytes": 63531379,
+                "md5_digest": "5fed42d2296baca042e2bf74785db725"
+            },
+            "en/en_US/kristin/medium/en_US-kristin-medium.onnx.json": {
+                "size_bytes": 4968,
+                "md5_digest": "70bc97d350c796c64ea5e4d08241afac"
+            },
+            "en/en_US/kristin/medium/MODEL_CARD": {
+                "size_bytes": 479,
+                "md5_digest": "9bfb8192299b34cddf76455f04cb8cd2"
+            }
+        },
+        "aliases": []
+    },
     "en_US-kusal-medium": {
         "key": "en_US-kusal-medium",
         "name": "kusal",
@@ -3549,6 +3609,66 @@
         },
         "aliases": []
     },
+    "en_US-ljspeech-high": {
+        "key": "en_US-ljspeech-high",
+        "name": "ljspeech",
+        "language": {
+            "code": "en_US",
+            "family": "en",
+            "region": "US",
+            "name_native": "English",
+            "name_english": "English",
+            "country_english": "United States"
+        },
+        "quality": "high",
+        "num_speakers": 1,
+        "speaker_id_map": {},
+        "files": {
+            "en/en_US/ljspeech/high/en_US-ljspeech-high.onnx": {
+                "size_bytes": 114199011,
+                "md5_digest": "dad093b5d2cff6a5fda99883ceda09d1"
+            },
+            "en/en_US/ljspeech/high/en_US-ljspeech-high.onnx.json": {
+                "size_bytes": 4967,
+                "md5_digest": "1f93986f7c39647383f2a0b1471f30aa"
+            },
+            "en/en_US/ljspeech/high/MODEL_CARD": {
+                "size_bytes": 515,
+                "md5_digest": "59322a9a8d2c0e556f0be1171cd54ea7"
+            }
+        },
+        "aliases": []
+    },
+    "en_US-ljspeech-medium": {
+        "key": "en_US-ljspeech-medium",
+        "name": "ljspeech",
+        "language": {
+            "code": "en_US",
+            "family": "en",
+            "region": "US",
+            "name_native": "English",
+            "name_english": "English",
+            "country_english": "United States"
+        },
+        "quality": "medium",
+        "num_speakers": 1,
+        "speaker_id_map": {},
+        "files": {
+            "en/en_US/ljspeech/medium/en_US-ljspeech-medium.onnx": {
+                "size_bytes": 63531379,
+                "md5_digest": "109d552e9dd78d92d1169a7edd6de38d"
+            },
+            "en/en_US/ljspeech/medium/en_US-ljspeech-medium.onnx.json": {
+                "size_bytes": 4969,
+                "md5_digest": "ae02d7c3f01e9606dcac4668e58720c9"
+            },
+            "en/en_US/ljspeech/medium/MODEL_CARD": {
+                "size_bytes": 517,
+                "md5_digest": "5ad31d314786587d4f97effb0b716d61"
+            }
+        },
+        "aliases": []
+    },
     "en_US-ryan-high": {
         "key": "en_US-ryan-high",
         "name": "ryan",
@@ -3834,6 +3954,36 @@
         },
         "aliases": []
     },
+    "es_MX-claude-high": {
+        "key": "es_MX-claude-high",
+        "name": "claude",
+        "language": {
+            "code": "es_MX",
+            "family": "es",
+            "region": "MX",
+            "name_native": "Español",
+            "name_english": "Spanish",
+            "country_english": "Mexico"
+        },
+        "quality": "high",
+        "num_speakers": 1,
+        "speaker_id_map": {},
+        "files": {
+            "es/es_MX/claude/high/es_MX-claude-high.onnx": {
+                "size_bytes": 63122309,
+                "md5_digest": "cb1966e0ff20ca3aa010f6c9a0ce296a"
+            },
+            "es/es_MX/claude/high/es_MX-claude-high.onnx.json": {
+                "size_bytes": 4963,
+                "md5_digest": "36882201922fabb409f18d284af3eddd"
+            },
+            "es/es_MX/claude/high/MODEL_CARD": {
+                "size_bytes": 247,
+                "md5_digest": "bca99dc9b9a6c8f2f8563d73d52c4a3b"
+            }
+        },
+        "aliases": []
+    },
     "fa_IR-amir-medium": {
         "key": "fa_IR-amir-medium",
         "name": "amir",
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Un arcoíris o arco iris es un fenómeno óptico y meteorológico que consiste en la aparición en el cielo de un arco de luz multicolor, originado por la descomposición de la luz solar en el espectro visible, la cual se produce por refracción, cuando los rayos del sol atraviesan pequeñas gotas de agua contenidas en la atmósfera terrestre.