feat: remote_voices

* feat: remote_voices: don't depend on piper from PyPI; improve voice caching/download; dynamic voice list from HuggingFace
* Apply suggestions from code review
* error handling
* Update ovos_tts_plugin_piper/download.py
* readme
* support local models and model urls
* test
* utils min version

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent 31c5eb5 · commit b530fb8 · 7 changed files with 6,957 additions and 307 deletions
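For context, with this change voices can come from the dynamic HuggingFace voice list, a local model file, or a model URL. A minimal mycroft.conf sketch follows; the "voice" short name shown is illustrative, and per the commit message a local model path or URL should also be accepted, though this diff does not confirm the exact option names:

    {
      "tts": {
        "module": "ovos-tts-plugin-piper",
        "ovos-tts-plugin-piper": {
          "voice": "alan-low"
        }
      }
    }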
@@ -0,0 +1,244 @@
import json
import wave
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Mapping, Sequence, Iterable, List, Optional, Tuple, Union

import numpy as np
import onnxruntime
from ovos_utils.log import LOG
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run

PAD = "_"  # padding (0)
BOS = "^"  # beginning of sentence
EOS = "$"  # end of sentence


class PhonemeType(str, Enum):
    ESPEAK = "espeak"
    TEXT = "text"


@dataclass
class PiperConfig:
    """Piper configuration"""

    num_symbols: int
    """Number of phonemes"""

    num_speakers: int
    """Number of speakers"""

    sample_rate: int
    """Sample rate of output audio"""

    espeak_voice: str
    """Name of espeak-ng voice or alphabet"""

    length_scale: float
    noise_scale: float
    noise_w: float

    phoneme_id_map: Mapping[str, Sequence[int]]
    """Phoneme -> [id,]"""

    phoneme_type: PhonemeType
    """espeak or text"""

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        inference = config.get("inference", {})

        return PiperConfig(
            num_symbols=config["num_symbols"],
            num_speakers=config["num_speakers"],
            sample_rate=config["audio"]["sample_rate"],
            noise_scale=inference.get("noise_scale", 0.667),
            length_scale=inference.get("length_scale", 1.0),
            noise_w=inference.get("noise_w", 0.8),
            espeak_voice=config["espeak"]["voice"],
            phoneme_id_map=config["phoneme_id_map"],
            phoneme_type=PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK)),
        )
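
# For reference, the shape of the voice .json consumed by PiperConfig.from_dict
# (the key layout mirrors the accesses above; the values are illustrative, not
# taken from a real voice):
#   {
#     "num_symbols": 256,
#     "num_speakers": 1,
#     "audio": {"sample_rate": 22050},
#     "espeak": {"voice": "en-us"},
#     "inference": {"noise_scale": 0.667, "length_scale": 1.0, "noise_w": 0.8},
#     "phoneme_type": "espeak",
#     "phoneme_id_map": {"_": [0], "^": [1], "$": [2]}
#   }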


def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range"""
    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm
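
# Worked example (illustrative): the loudest sample is scaled to the int16
# ceiling, so the gain here is 32767 / 0.5:
#   audio_float_to_int16(np.array([0.0, 0.25, -0.5], dtype=np.float32))
#   -> array([0, 16383, -32767], dtype=int16)
# The max(0.01, ...) floor keeps near-silent buffers from being blown up by
# an enormous gain.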


@dataclass
class PiperVoice:
    session: onnxruntime.InferenceSession
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Load an ONNX model and config."""
        if config_path is None:
            config_path = f"{model_path}.json"

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        providers: List[Union[str, Tuple[str, Dict[str, Any]]]]
        if use_cuda:
            providers = [
                (
                    "CUDAExecutionProvider",
                    {"cudnn_conv_algo_search": "HEURISTIC"},
                )
            ]
        else:
            providers = ["CPUExecutionProvider"]

        return PiperVoice(
            config=PiperConfig.from_dict(config_dict),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=providers,
            ),
        )
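
    # e.g. PiperVoice.load("/path/to/voice.onnx") (placeholder path) reads the
    # config from "/path/to/voice.onnx.json" and runs on CPU by default.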

    def phonemize(self, text: str) -> List[List[str]]:
        """Text to phonemes grouped by sentence."""
        if self.config.phoneme_type == PhonemeType.ESPEAK:
            if self.config.espeak_voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)

            return phonemize_espeak(text, self.config.espeak_voice)

        if self.config.phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Phonemes to ids."""
        id_map = self.config.phoneme_id_map
        ids: List[int] = list(id_map[BOS])

        for phoneme in phonemes:
            if phoneme not in id_map:
                LOG.warning("Missing phoneme from id map: %s", phoneme)
                continue

            ids.extend(id_map[phoneme])
            ids.extend(id_map[PAD])

        ids.extend(id_map[EOS])

        return ids
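
    # Illustrative: with an id map of {"^": [1], "$": [2], "_": [0], "a": [5]},
    # phonemes_to_ids(["a", "a"]) returns [1, 5, 0, 5, 0, 2]: BOS, then each
    # phoneme id interleaved with PAD, then EOS.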

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize WAV audio from text."""
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        for audio_bytes in self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        ):
            wav_file.writeframes(audio_bytes)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Synthesize raw audio per sentence from text."""
        sentence_phonemes = self.phonemize(text)

        # 16-bit mono
        num_silence_samples = int(sentence_silence * self.config.sample_rate)
        silence_bytes = bytes(num_silence_samples * 2)

        for phonemes in sentence_phonemes:
            phoneme_ids = self.phonemes_to_ids(phonemes)
            yield self.synthesize_ids_to_raw(
                phoneme_ids,
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            ) + silence_bytes
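
    # Illustrative: at a 22050 Hz sample rate, sentence_silence=0.5 yields
    # int(0.5 * 22050) = 11025 silent samples, i.e. 22050 zero bytes appended
    # per sentence (2 bytes per 16-bit mono sample).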

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize raw audio from phoneme ids."""
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        args = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths,
            "scales": scales,
        }

        if self.config.num_speakers <= 1:
            speaker_id = None

        if (self.config.num_speakers > 1) and (speaker_id is None):
            # Default speaker
            speaker_id = 0

        if speaker_id is not None:
            sid = np.array([speaker_id], dtype=np.int64)
            # Bug fix vs. upstream: only add "sid" when a speaker is set.
            # Upstream always passes "sid" (as None), which crashes
            # single-speaker models.
            args["sid"] = sid

        # Synthesize through ONNX
        audio = self.session.run(None, args)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        return audio.tobytes()
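For reference, a minimal end-to-end sketch of the class added above (the import path is a guess at where this file lands in the package, and the model path is a placeholder):

    import wave

    from ovos_tts_plugin_piper.piper import PiperVoice  # hypothetical module path

    # Config is read from /path/to/voice.onnx.json automatically
    voice = PiperVoice.load("/path/to/voice.onnx")
    with wave.open("hello.wav", "wb") as wav:
        voice.synthesize("Hello from Piper!", wav, sentence_silence=0.25)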