Skip to content

Commit

Permalink
feat:remote_voices (#33)
Browse files Browse the repository at this point in the history
* feat:remote_voices

don't depend on piper from PyPI

improve voice caching/download

dynamic voice list from huggingface

* Apply suggestions from code review

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

* error handling

* Update ovos_tts_plugin_piper/download.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

* readme

* support local models and model urls

* test

* utils min version

---------

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
  • Loading branch information
JarbasAl and coderabbitai[bot] authored Dec 19, 2024
1 parent 31c5eb5 commit b530fb8
Show file tree
Hide file tree
Showing 7 changed files with 6,957 additions and 307 deletions.
33 changes: 28 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,11 @@

## Configuration

download models from https://github.com/rhasspy/piper/releases/tag/v0.0.2
voice models are automatically downloaded from https://huggingface.co/rhasspy/piper-voices into `~/.local/share/piper_tts`

you can also pass a URL for a .tar.gz model, and it will be downloaded automatically
full list of voices can be found [here](https://huggingface.co/rhasspy/piper-voices/blob/main/voices.json)

if no model is passed it will be auto selected based on language

you can pass a model name alias, eg "alan-low"
you can also pass a short name alias without the language code, e.g. `"alan-low"` instead of `"en_GB-alan-low"`

```json
"tts": {
Expand All @@ -24,3 +22,28 @@ you can pass a model name alias, eg "alan-low"
}
}
```
if no voice is set, one will be selected automatically based on the language

you can also define a local path for your own model

```json
"tts": {
"module": "ovos-tts-plugin-piper",
"ovos-tts-plugin-piper": {
"model": "/path/to/model.onnx",
"model_config": "/path/to/model.onnx.json"
}
}
```

or a remote url

```json
"tts": {
"module": "ovos-tts-plugin-piper",
"ovos-tts-plugin-piper": {
"model": "https://huggingface.co/poisson-fish/piper-vasco/resolve/main/onnx/vasco.onnx",
"model_config": "https://huggingface.co/poisson-fish/piper-vasco/resolve/main/onnx/vasco.onnx.json"
}
}
```
380 changes: 79 additions & 301 deletions ovos_tts_plugin_piper/__init__.py

Large diffs are not rendered by default.

244 changes: 244 additions & 0 deletions ovos_tts_plugin_piper/piper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
import json
import wave
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Mapping, Sequence, Iterable, List, Optional, Tuple, Union

import numpy as np
import onnxruntime
from ovos_utils.log import LOG
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run

# Special symbols looked up in a voice's phoneme_id_map when a phoneme list is
# converted to ids (see PiperVoice.phonemes_to_ids).
PAD = "_"  # padding (0); its ids are appended after every mapped phoneme
BOS = "^"  # beginning of sentence; its ids start every id sequence
EOS = "$"  # end of sentence; its ids terminate every id sequence


class PhonemeType(str, Enum):
    """Selects how PiperVoice.phonemize turns text into phonemes."""

    # Text is phonemized with phonemize_espeak using the configured voice.
    ESPEAK = "espeak"

    # Text is passed to phonemize_codepoints.
    TEXT = "text"


@dataclass
class PiperConfig:
    """Settings of a Piper voice, built from the voice's config dictionary."""

    # Number of phonemes
    num_symbols: int

    # Number of speakers
    num_speakers: int

    # Sample rate of output audio
    sample_rate: int

    # Name of espeak-ng voice or alphabet
    espeak_voice: str

    # Default inference scales, used when a synthesis call passes None
    length_scale: float
    noise_scale: float
    noise_w: float

    # Phoneme -> [id,]
    phoneme_id_map: Mapping[str, Sequence[int]]

    # espeak or text
    phoneme_type: PhonemeType

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        """Create a PiperConfig from a parsed voice config dictionary.

        Required keys: ``num_symbols``, ``num_speakers``,
        ``audio.sample_rate``, ``espeak.voice`` and ``phoneme_id_map``.
        The optional ``inference`` section supplies ``noise_scale``
        (default 0.667), ``length_scale`` (default 1.0) and ``noise_w``
        (default 0.8). ``phoneme_type`` defaults to espeak.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``phoneme_type`` is not a known PhonemeType.
        """
        inference_section = config.get("inference", {})

        # Entries are evaluated top to bottom, so the first missing required
        # key is the one reported.
        field_values = {
            "num_symbols": config["num_symbols"],
            "num_speakers": config["num_speakers"],
            "sample_rate": config["audio"]["sample_rate"],
            "noise_scale": inference_section.get("noise_scale", 0.667),
            "length_scale": inference_section.get("length_scale", 1.0),
            "noise_w": inference_section.get("noise_w", 0.8),
            "espeak_voice": config["espeak"]["voice"],
            "phoneme_id_map": config["phoneme_id_map"],
            "phoneme_type": PhonemeType(
                config.get("phoneme_type", PhonemeType.ESPEAK)
            ),
        }
        return PiperConfig(**field_values)


def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Peak-normalize float audio and convert it to int16 samples.

    The signal is scaled so that its largest absolute sample maps to
    ``max_wav_value``. The peak is floored at 0.01 so that silent or
    near-silent input is not multiplied by an unbounded gain.

    Args:
        audio: Float samples of any shape; array-likes are accepted.
        max_wav_value: Magnitude the loudest sample is scaled to, and the
            clipping bound applied before the integer conversion.

    Returns:
        An int16 array with the same shape as ``audio``. Empty input yields
        an empty int16 array.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # np.max raises ValueError on a zero-size array; there is nothing to
        # normalize, so just hand back an empty int16 array.
        return audio.astype("int16")

    peak = max(0.01, np.max(np.abs(audio)))
    scaled = np.clip(audio * (max_wav_value / peak), -max_wav_value, max_wav_value)
    return scaled.astype("int16")


@dataclass
class PiperVoice:
    """A loaded Piper voice: an ONNX inference session plus its config."""

    session: onnxruntime.InferenceSession
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Build a voice from an ONNX model file and its JSON config.

        Args:
            model_path: Path of the ONNX model.
            config_path: Path of the JSON config; when omitted,
                ``<model_path>.json`` is used.
            use_cuda: Request the CUDA execution provider instead of the
                CPU one.

        Returns:
            A PiperVoice wrapping the new inference session and the parsed
            config.
        """
        json_path = f"{model_path}.json" if config_path is None else config_path
        with open(json_path, "r", encoding="utf-8") as handle:
            raw_config = json.load(handle)

        providers: List[Union[str, Tuple[str, Dict[str, Any]]]] = (
            [("CUDAExecutionProvider", {"cudnn_conv_algo_search": "HEURISTIC"})]
            if use_cuda
            else ["CPUExecutionProvider"]
        )

        # Parse the config before creating the session, so a malformed
        # config fails before the model is loaded.
        voice_config = PiperConfig.from_dict(raw_config)
        ort_session = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=onnxruntime.SessionOptions(),
            providers=providers,
        )
        return PiperVoice(session=ort_session, config=voice_config)

    def phonemize(self, text: str) -> List[List[str]]:
        """Convert text to phonemes, one inner list per sentence.

        Raises:
            ValueError: If the configured phoneme type is neither espeak
                nor text.
        """
        kind = self.config.phoneme_type

        if kind == PhonemeType.ESPEAK:
            voice = self.config.espeak_voice
            if voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)
            return phonemize_espeak(text, voice)

        if kind == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Map one sentence of phonemes to the id sequence fed to the model.

        The sequence starts with the BOS ids and ends with the EOS ids; the
        ids of each known phoneme are followed by the PAD ids. Phonemes
        missing from the id map are logged and skipped.
        """
        table = self.config.phoneme_id_map
        sequence: List[int] = list(table[BOS])

        for symbol in phonemes:
            if symbol in table:
                sequence.extend(table[symbol])
                sequence.extend(table[PAD])
            else:
                LOG.warning("Missing phoneme from id map: %s", symbol)

        sequence.extend(table[EOS])
        return sequence

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize text and write it to an open WAV writer.

        The writer is configured as 16-bit mono at the voice's sample rate,
        then the audio of each sentence is written as it is produced. The
        remaining arguments are forwarded to ``synthesize_stream_raw``.
        """
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        sentence_chunks = self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        )
        for chunk in sentence_chunks:
            wav_file.writeframes(chunk)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Yield raw 16-bit mono audio bytes, one chunk per sentence.

        ``sentence_silence`` is the number of seconds of silence (zero
        bytes) appended to every chunk.
        """
        sentences = self.phonemize(text)

        # 16-bit mono: two zero bytes per silent sample
        silent_sample_count = int(sentence_silence * self.config.sample_rate)
        trailing_silence = bytes(silent_sample_count * 2)

        for sentence in sentences:
            sentence_audio = self.synthesize_ids_to_raw(
                self.phonemes_to_ids(sentence),
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            )
            yield sentence_audio + trailing_silence

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Run the model on one phoneme id sequence and return int16 bytes.

        Scale arguments left as None fall back to the values stored in the
        voice config. A speaker id is only sent for voices with more than
        one speaker, defaulting to speaker 0.
        """
        length_scale = self.config.length_scale if length_scale is None else length_scale
        noise_scale = self.config.noise_scale if noise_scale is None else noise_scale
        noise_w = self.config.noise_w if noise_w is None else noise_w

        # Batch of one sequence
        ids_batch = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        model_inputs = {
            "input": ids_batch,
            "input_lengths": np.array([ids_batch.shape[1]], dtype=np.int64),
            "scales": np.array(
                [noise_scale, length_scale, noise_w],
                dtype=np.float32,
            ),
        }

        # Only multi-speaker voices receive "sid". This is the bug fix:
        # upstream passes "sid": None to args, which crashes single speaker
        # models.
        if self.config.num_speakers > 1:
            chosen_speaker = 0 if speaker_id is None else speaker_id
            model_inputs["sid"] = np.array([chosen_speaker], dtype=np.int64)

        # Synthesize through Onnx; the first output is squeezed on its two
        # leading axes before conversion.
        waveform = self.session.run(None, model_inputs)[0].squeeze((0, 1))
        return audio_float_to_int16(waveform.squeeze()).tobytes()
Loading

0 comments on commit b530fb8

Please sign in to comment.