From 8585de172409dda3293b68073227e8213f643053 Mon Sep 17 00:00:00 2001 From: jarbasai Date: Fri, 3 Mar 2023 21:16:36 +0000 Subject: [PATCH] migrate ovos-listener --- mycroft/listener/__init__.py | 536 +-------------- mycroft/listener/__main__.py | 2 +- mycroft/listener/data_structures.py | 63 +- mycroft/listener/hotword_factory.py | 17 +- mycroft/listener/mic.py | 998 +--------------------------- mycroft/listener/service.py | 408 +----------- mycroft/listener/silence.py | 424 +----------- mycroft/listener/stt.py | 16 +- 8 files changed, 24 insertions(+), 2440 deletions(-) diff --git a/mycroft/listener/__init__.py b/mycroft/listener/__init__.py index 892a99ad4f2f..53caae29d4f1 100644 --- a/mycroft/listener/__init__.py +++ b/mycroft/listener/__init__.py @@ -1,533 +1,3 @@ -# Copyright 2017 Mycroft AI Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import json -import time -from queue import Queue, Empty -from threading import Thread, Lock - -import pyaudio -from pyee import EventEmitter - -from mycroft.listener.hotword_factory import HotWordFactory -from mycroft.listener.mic import MutableMicrophone, ResponsiveRecognizer, ListenerState -from ovos_config.config import Configuration -from mycroft.metrics import Stopwatch, report_timing -from mycroft.session import SessionManager -from mycroft.listener.stt import STTFactory -from mycroft.util import find_input_device -from ovos_utils.log import LOG - -MAX_MIC_RESTARTS = 20 - -AUDIO_DATA = 0 -STREAM_START = 1 -STREAM_DATA = 2 -STREAM_STOP = 3 - - -class AudioStreamHandler: - def __init__(self, queue): - self.queue = queue - - def stream_start(self): - self.queue.put((STREAM_START, None, None)) - - def stream_chunk(self, chunk, lang=None): - self.queue.put((STREAM_DATA, chunk, lang)) - - def stream_stop(self): - self.queue.put((STREAM_STOP, None, None)) - - -class AudioProducer(Thread): - """AudioProducer - Given a mic and a recognizer implementation, continuously listens to the - mic for potential speech chunks and pushes them onto the queue. - """ - - def __init__(self, loop): - super(AudioProducer, self).__init__() - self.daemon = True - self.loop = loop - self.stream_handler = None - if self.loop.stt.can_stream: - self.stream_handler = AudioStreamHandler(self.loop.queue) - - def run(self): - restart_attempts = 0 - with self.loop.microphone as source: - self.loop.responsive_recognizer.adjust_for_ambient_noise(source) - while self.loop.state.running: - try: - audio, lang = self.loop.responsive_recognizer.listen( - source, self.stream_handler) - if audio is not None: - self.loop.queue.put((AUDIO_DATA, audio, lang)) - else: - LOG.warning("Audio contains no data.") - except IOError as e: - # IOError will be thrown if the read is unsuccessful. - # If self.recognizer.overflow_exc is False (default) - # input buffer overflow IOErrors due to not consuming the - # buffers quickly enough will be silently ignored. 
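# [editor's illustration, not part of the patch] The producer/consumer pair
# above talks through tagged tuples on a plain queue.Queue. A minimal round
# trip, assuming the top-level re-exports this patch introduces in
# ovos_listener, looks like:
from queue import Queue
from ovos_listener import (STREAM_START, STREAM_DATA, STREAM_STOP,
                           AudioStreamHandler)

q = Queue()
handler = AudioStreamHandler(q)
handler.stream_start()                        # puts (STREAM_START, None, None)
handler.stream_chunk(b"\x00" * 320, "en-us")  # puts (STREAM_DATA, chunk, lang)
handler.stream_stop()                         # puts (STREAM_STOP, None, None)
tag, data, lang = q.get()
assert tag == STREAM_START
# AudioConsumer.read() drains these tuples and dispatches on the tag.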
- LOG.error('IOError Exception in AudioProducer') - if e.errno == pyaudio.paInputOverflowed: - pass # Ignore overflow errors - elif restart_attempts < MAX_MIC_RESTARTS: - # restart the mic - restart_attempts += 1 - LOG.debug('Restarting the microphone...') - source.restart() - LOG.debug('Restarted...') - else: - LOG.error('Restarting mic doesn\'t seem to work. ' - 'Stopping...') - raise - except Exception: - LOG.exception("error in audio producer") - source.restart() - LOG.debug('Mic Restarted.') - # raise - else: - # Reset restart attempt counter on sucessful audio read - restart_attempts = 0 - finally: - if self.stream_handler is not None: - self.stream_handler.stream_stop() - LOG.info("Loop stopped running") - - def stop(self): - """Stop producer thread.""" - LOG.debug("stopping producer") - self.loop.state.running = False - self.loop.responsive_recognizer.stop() - - -class AudioConsumer(Thread): - """AudioConsumer - Consumes AudioData chunks off the queue - """ - - # In seconds, the minimum audio size to be sent to remote STT - MIN_AUDIO_SIZE = 0.5 - - def __init__(self, loop): - super(AudioConsumer, self).__init__() - self.daemon = True - self.loop = loop - - def run(self): - while self.loop.state.running: - self.read() - - def read(self): - try: - message = self.loop.queue.get(timeout=0.5) - except Empty: - return - - if message is None: - return - - tag, data, lang = message - - if tag == AUDIO_DATA: - if data is not None and not self.loop.state.sleeping: - self.process(data) - elif tag == STREAM_START: - self.loop.stt.stream_start() - elif tag == STREAM_DATA: - self.loop.stt.stream_data(data) - elif tag == STREAM_STOP: - self.loop.stt.stream_stop() - else: - LOG.error("Unknown audio queue type %r" % message) - - @staticmethod - def _audio_length(audio): - return float(len(audio.frame_data)) / ( - audio.sample_rate * audio.sample_width) - - def process(self, audio, lang=None): - if audio is None: - return - - if self._audio_length(audio) < self.MIN_AUDIO_SIZE: - LOG.warning("Audio too short to be processed") - else: - stopwatch = Stopwatch() - with stopwatch: - transcription = self.transcribe(audio, lang) - if transcription: - ident = str(stopwatch.timestamp) + str(hash(transcription)) - - # STT succeeded, send the transcribed speech on for processing - utts = [transcription] - payload = { - 'utterances': utts, - 'lang': self.loop.stt.lang, - 'session': SessionManager.get().session_id, - 'ident': ident - } - self.loop.emit("recognizer_loop:utterance", payload) - - # Report timing metrics - report_timing(ident, 'stt', stopwatch, - {'transcription': transcription, - 'stt': self.loop.stt.__class__.__name__}) - else: - ident = str(stopwatch.timestamp) - - def transcribe(self, audio, lang): - def send_unknown_intent(): - """ Send message that nothing was transcribed. 
""" - if self.loop.responsive_recognizer.listen_state == ListenerState.WAKEWORD: - self.loop.emit('recognizer_loop:speech.recognition.unknown') - - try: - # Invoke the STT engine on the audio clip - with self.loop.lock: - try: - text = self.loop.stt.execute(audio, language=lang) - except Exception as e: - if self.loop.fallback_stt: - LOG.warning(f"Using fallback STT, main plugin failed: {e}") - text = self.loop.fallback_stt.execute(audio, language=lang) - else: - raise e - if text is not None: - text = text.lower().strip() - LOG.debug("STT: " + text) - else: - send_unknown_intent() - LOG.info('no words were transcribed') - return text - except Exception as e: - send_unknown_intent() - LOG.exception("Speech Recognition could not understand audio") - return None - - -class RecognizerLoopState: - def __init__(self): - self.running = False - self.sleeping = False - - -def recognizer_conf_hash(config): - """Hash of the values important to the listener.""" - c = { - 'listener': config.get('listener'), - 'hotwords': config.get('hotwords'), - 'stt': config.get('stt'), - 'opt_in': config.get('opt_in', False) - } - return hash(json.dumps(c, sort_keys=True)) - - -class RecognizerLoop(EventEmitter): - """ EventEmitter loop running speech recognition. - - Local wake word recognizer and remote general speech recognition. - - Args: - bus (MessageBusClient): mycroft messagebus connection - watchdog: (callable) function to call periodically indicating - operational status. - stt (STT): stt plugin to be used for inference - (optional, can be set later via self.bind ) - """ - - def __init__(self, bus, watchdog=None, stt=None, fallback_stt=None): - super(RecognizerLoop, self).__init__() - self._watchdog = watchdog - self.mute_calls = 0 - self.lock = Lock() - self.stt = stt - self.fallback_stt = fallback_stt - self.bus = bus - self.engines = {} - self.queue = None - self.audio_consumer = None - self.audio_producer = None - self.responsive_recognizer = None - - self.needs_reload = False - self._load_config() - Configuration.set_config_watcher(self._on_config_update) - - def bind(self, stt, fallback_stt=None): - self.stt = stt - if fallback_stt: - self.fallback_stt = fallback_stt - - @property - def wakeup_words(self): - return {k: v for k, v in self.engines.items() - if v.get("wakeup")} - - @property - def listen_words(self): - return {k: v for k, v in self.engines.items() - if v.get("listen")} - - @property - def stop_words(self): - return {k: v for k, v in self.engines.items() - if v.get("stopword")} - - @property - def hot_words(self): - return {k: v for k, v in self.engines.items() - if not v.get("stopword") and not v.get("wakeup")} - - @property - def listen_state(self): - return self.responsive_recognizer.listen_state - - @listen_state.setter - def listen_state(self, val): - self.responsive_recognizer.listen_state = val - - @property - def listen_mode(self): - return self.responsive_recognizer.listen_mode - - @listen_mode.setter - def listen_mode(self, val): - self.responsive_recognizer.listen_mode = val - - def _load_config(self): - """Load configuration parameters from configuration.""" - config = Configuration() - self.config_core = config - self._config_hash = recognizer_conf_hash(config) - self.lang = config.get('lang') - self.config = config.get('listener') - rate = self.config.get('sample_rate') - - device_index = self.config.get('device_index') - device_name = self.config.get('device_name') - retry_mic = self.config.get('retry_mic_init', True) - - if not device_index and device_name: - 
device_index = find_input_device(device_name) - - LOG.debug('Using microphone (None = default): ' + str(device_index)) - - self.microphone = MutableMicrophone(device_index, rate, - mute=self.mute_calls > 0, - retry=retry_mic) - self.create_hotword_engines() - self.state = RecognizerLoopState() - self.responsive_recognizer = ResponsiveRecognizer(self) - - def create_hotword_engines(self): - LOG.info("creating hotword engines") - hot_words = self.config_core.get("hotwords", {}) - global_listen = self.config_core.get("confirm_listening") - global_sounds = self.config_core.get("sounds", {}) - - main_ww = self.config_core.get("listener", {}).get("wake_word", "hey_mycroft").replace(" ", "_") - wakeupw = self.config_core.get("listener", {}).get("stand_up_word", "wake_up").replace(" ", "_") - - for word, data in dict(hot_words).items(): - try: - # normalization step to avoid naming collisions - # TODO - move this to ovos_config package, on changes to the hotwords section this should be enforced directly - # this approach does not fully solve the issue, config merging may be messed up - word = word.replace(" ", "_") - - sound = data.get("sound") - utterance = data.get("utterance") - listen = data.get("listen", False) - wakeup = data.get("wakeup", False) - stopword = data.get("stopword", False) - trigger = data.get("trigger", False) - lang = data.get("stt_lang", self.lang) - enabled = data.get("active") - event = data.get("bus_event") - - # automatically enable default wake words - # only if the active status is undefined - if enabled is None: - if word == main_ww or word == wakeupw: - enabled = True - else: - enabled = False - - # global listening sound - if not sound and listen and global_listen: - sound = global_sounds.get("start_listening") - - if not enabled: - continue - engine = HotWordFactory.create_hotword(word, - lang=lang, - loop=self) - if engine is not None: - if hasattr(engine, "bind"): - engine.bind(self.bus) - # not all plugins implement this - if data.get('engine'): - LOG.info(f"Engine previously defined. 
" - f"Deleting old instance.") - try: - data['engine'].stop() - del data['engine'] - except Exception as e: - LOG.error(e) - self.engines[word] = {"engine": engine, - "sound": sound, - "bus_event": event, - "trigger": trigger, - "utterance": utterance, - "stt_lang": lang, - "listen": listen, - "wakeup": wakeup, - "stopword": stopword} - except Exception as e: - LOG.error("Failed to load hotword: " + word) - - @staticmethod - def get_fallback_stt(): - config_core = Configuration() - stt_config = config_core.get('stt', {}) - engine = stt_config.get("fallback_module") - if engine == stt_config.get("module", "mycroft"): - LOG.warning("Fallback STT is the same as default STT") - elif not engine: - LOG.warning("No fallback STT configured") - else: - plugin_config = stt_config.get(engine) or {} - plugin_config["lang"] = plugin_config.get("lang") or \ - config_core.get("lang", "en-us") - clazz = STTFactory.get_class({"module": engine, - engine: plugin_config}) - if clazz: - return clazz - else: - LOG.warning(f"Could not find plugin: {engine}") - LOG.error(f"Failed to create fallback STT") - - def start_async(self): - """Start consumer and producer threads.""" - self.state.running = True - if not self.stt: - self.stt = STTFactory.create() - if not self.fallback_stt: - clazz = self.get_fallback_stt() - if clazz: - self.fallback_stt = clazz() - - self.queue = Queue() - self.audio_consumer = AudioConsumer(self) - self.audio_consumer.start() - self.audio_producer = AudioProducer(self) - self.audio_producer.start() - - def stop(self): - self.state.running = False - if self.audio_producer: - self.audio_producer.stop() - # stop wake word detectors - # NOTE: self.engines can change during iteration due to config changes - # so lets use a copy - for ww, hotword in dict(self.engines).items(): - try: - hotword["engine"].stop() - except: - LOG.exception(f"Failed to stop hotword engine: {ww}") - # wait for threads to shutdown - if self.audio_producer: - self.audio_producer.join(1) - if self.audio_consumer: - self.audio_consumer.join(1) - - def mute(self): - """Mute microphone and increase number of requests to mute.""" - self.mute_calls += 1 - if self.microphone: - self.microphone.mute() - - def unmute(self): - """Unmute mic if as many unmute calls as mute calls have been received. - """ - if self.mute_calls > 0: - self.mute_calls -= 1 - - if self.mute_calls <= 0 and self.microphone: - self.microphone.unmute() - self.mute_calls = 0 - - def force_unmute(self): - """Completely unmute mic regardless of the number of calls to mute.""" - self.mute_calls = 0 - self.unmute() - - def is_muted(self): - if self.microphone: - return self.microphone.is_muted() - else: - return True # consider 'no mic' muted - - def sleep(self): - self.state.sleeping = True - - def awaken(self): - self.state.sleeping = False - - def _on_config_update(self): - current_hash = recognizer_conf_hash(Configuration()) - if current_hash != self._config_hash: - self._config_hash = current_hash - self.needs_reload = True - - def run(self): - """Start and reload mic and STT handling threads as needed. - - Wait for KeyboardInterrupt and shutdown cleanly. 
- """ - try: - self.start_async() - except Exception: - LOG.exception('Starting producer/consumer threads for listener ' - 'failed.') - return - - # Handle reload of consumer / producer if config changes - while self.state.running: - try: - time.sleep(0.5) - if self.needs_reload: - LOG.debug('Config has changed, reloading...') - self.reload() - except KeyboardInterrupt as e: - LOG.error(e) - self.stop() - raise # Re-raise KeyboardInterrupt - except Exception: - LOG.exception('Exception in RecognizerLoop') - raise - - def reload(self): - """Reload configuration and restart consumer and producer.""" - with self.lock: - self.stop() - # load config - self._load_config() - # restart - self.needs_reload = False - self.start_async() +# backwards compat imports +from ovos_listener import MAX_MIC_RESTARTS,AUDIO_DATA,STREAM_START , STREAM_DATA, STREAM_STOP, \ + AudioStreamHandler, AudioProducer, AudioConsumer, RecognizerLoop, RecognizerLoopState, recognizer_conf_hash diff --git a/mycroft/listener/__main__.py b/mycroft/listener/__main__.py index d463f46d8f66..13f57b16b1a1 100644 --- a/mycroft/listener/__main__.py +++ b/mycroft/listener/__main__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from mycroft.listener.service import SpeechService, on_error, on_stopping, on_ready +from ovos_listener.service import SpeechService, on_error, on_stopping, on_ready from ovos_config.locale import setup_locale from ovos_utils.log import init_service_logger from ovos_utils.process_utils import reset_sigint_handler, PIDLock diff --git a/mycroft/listener/data_structures.py b/mycroft/listener/data_structures.py index 0c48919b2d52..260864a0f8f6 100644 --- a/mycroft/listener/data_structures.py +++ b/mycroft/listener/data_structures.py @@ -1,62 +1,3 @@ -# Copyright 2020 Mycroft AI Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Data structures used by the speech client.""" +# backwards compat imports +from ovos_listener.data_structures import CyclicAudioBuffer from mycroft.deprecated.speech_client import RollingMean - - -class CyclicAudioBuffer: - """A Cyclic audio buffer for storing binary data. - - TODO: The class is still unoptimized and performance can probably be - enhanced. - - Args: - size (int): size in bytes - initial_data (bytes): initial buffer data - """ - - def __init__(self, size, initial_data): - self.size = size - # Get at most size bytes from the end of the initial data - self._buffer = initial_data[-size:] - - def clear(self): - self._buffer = b'\0' * self.size - - def append(self, data): - """Add new data to the buffer, and slide out data if the buffer is full - - Args: - data (bytes): binary data to append to the buffer. If buffer size - is exceeded the oldest data will be dropped. 
- """ - buff = self._buffer + data - if len(buff) > self.size: - buff = buff[-self.size:] - self._buffer = buff - - def get(self): - """Get the binary data.""" - return self._buffer - - def get_last(self, size): - """Get the last entries of the buffer.""" - return self._buffer[-size:] - - def __getitem__(self, key): - return self._buffer[key] - - def __len__(self): - return len(self._buffer) diff --git a/mycroft/listener/hotword_factory.py b/mycroft/listener/hotword_factory.py index 6cb124dc1c0d..40ed9b96c68f 100644 --- a/mycroft/listener/hotword_factory.py +++ b/mycroft/listener/hotword_factory.py @@ -1,13 +1,4 @@ -from ovos_config.config import Configuration -from ovos_plugin_manager.wakewords import OVOSWakeWordFactory, \ - load_wake_word_plugin, find_wake_word_plugins -from ovos_plugin_manager.templates.hotwords import HotWordEngine - - -class HotWordFactory(OVOSWakeWordFactory): - @classmethod - def create_hotword(cls, hotword="hey mycroft", config=None, - lang="en-us", loop=None): - if not config: - config = Configuration()['hotwords'] - return OVOSWakeWordFactory.create_hotword(hotword, config, lang, loop) +# backwards compat imports +from ovos_listener.hotword_factory import HotWordFactory +from ovos_plugin_manager.wakewords import load_wake_word_plugin, find_wake_word_plugins +from ovos_plugin_manager.templates.hotwords import HotWordEngine \ No newline at end of file diff --git a/mycroft/listener/mic.py b/mycroft/listener/mic.py index 705874231513..2dbc1024ab7a 100644 --- a/mycroft/listener/mic.py +++ b/mycroft/listener/mic.py @@ -1,999 +1,11 @@ -# Copyright 2017 Mycroft AI Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import audioop -import datetime -import itertools -import os -import time -from collections import deque, namedtuple -from enum import Enum -from hashlib import md5 -from os.path import join -from threading import Lock, Event, Thread -from time import sleep, time as get_time - -import pyaudio -import requests -import speech_recognition +# backwards compat imports from speech_recognition import ( Microphone, AudioSource, AudioData ) - - -from ovos_backend_client.api import DeviceApi, DatasetApi -from ovos_config.config import Configuration from mycroft.deprecated.speech_client import NoiseTracker -from mycroft.listener.data_structures import RollingMean, CyclicAudioBuffer -from mycroft.listener.silence import SilenceDetector, SilenceResultType, SilenceMethod -from mycroft.session import SessionManager -from mycroft.util import ( - resolve_resource_file, - play_wav, play_ogg, play_mp3 -) -from mycroft.util.audio_utils import play_listening_sound, play_end_listening_sound -from ovos_utils.log import LOG -from ovos_utils.signal import check_for_signal, get_ipc_directory -from ovos_config.locations import get_xdg_data_save_path -from mycroft.util.audio_utils import play_audio_file -from ovos_plugin_manager.vad import OVOSVADFactory -from ovos_utils.messagebus import get_message_lang - -WakeWordData = namedtuple('WakeWordData', - ['audio', 'found', 'stopped', 'end_audio']) - - -class ListenerState(str, Enum): - """ current listener state """ - WAKEWORD = "wakeword" - CONTINUOUS = "continuous" - RECORDING = "recording" - - -class ListeningMode(str, Enum): - """ global listening mode """ - WAKEWORD = "wakeword" - CONTINUOUS = "continuous" - HYBRID = "hybrid" - - -class MutableStream: - def __init__(self, wrapped_stream, format, muted=False, frames_per_buffer=4000): - assert wrapped_stream is not None - self.wrapped_stream = wrapped_stream - - self.format = format - self.frames_per_buffer = frames_per_buffer - self.SAMPLE_WIDTH = pyaudio.get_sample_size(self.format) - self.bytes_per_buffer = self.frames_per_buffer * self.SAMPLE_WIDTH - self.muted_buffer = b''.join([b'\x00' * self.SAMPLE_WIDTH]) - self.read_lock = Lock() - - self.chunk = bytes(self.bytes_per_buffer) - self.chunk_ready = Event() - - # The size of this queue is important. - # Too small, and chunks could be missed. - # Too large, and there will be a delay in wake word recognition. - self.chunk_deque = deque(maxlen=8) - - self.muted = muted - - # Begin listening - self.wrapped_stream.start_stream() - - def mute(self): - """Stop the stream and set the muted flag.""" - self.muted = True - - def unmute(self): - """Start the stream and clear the muted flag.""" - self.muted = False - - def iter_chunks(self): - with self.read_lock: - while True: - while self.chunk_deque: - yield self.chunk_deque.popleft() - - self.chunk_ready.clear() - self.chunk_ready.wait() - - def read(self, size=1024, of_exc=False): - """Read data from stream. - - Args: - size (int): Number of bytes to read - of_exc (bool): flag determining if the audio producer thread - should throw IOError at overflows. - - Returns: - (bytes) Data read from device - """ - # If muted during read return empty buffer. 
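# [editor's illustration] read() below assembles fixed-size reads out of the
# variable-size callback chunks; the core loop is equivalent to:
def read_exact(chunks, size: int) -> bytes:
    frames, remaining = [], size
    for chunk in chunks:            # chunks: any iterator of byte buffers
        frames.append(chunk)
        remaining -= len(chunk)
        if remaining <= 0:
            break
    return b"".join(frames)

assert read_exact(iter([b"ab", b"cd", b"ef"]), 3) == b"abcd"  # may overshoot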
This ensures no - # reads occur while the stream is stopped - if self.muted: - return self.muted_buffer - - frames = deque() - remaining = size - - for chunk in self.iter_chunks(): - frames.append(chunk) - remaining -= len(chunk) - if remaining <= 0: - break - - audio = b"".join(list(frames)) - return audio - - def close(self): - self.wrapped_stream.stop_stream() - self.wrapped_stream.close() - self.wrapped_stream = None - - def is_stopped(self): - try: - return self.wrapped_stream.is_stopped() - except Exception as e: - LOG.error(repr(e)) - return True # Assume the stream has been closed and thusly stopped - - def stop_stream(self): - return self.wrapped_stream.stop_stream() - - -class MutableMicrophone(Microphone): - def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024, - mute=False, retry=True): - Microphone.__init__(self, device_index=device_index, - sample_rate=sample_rate, chunk_size=chunk_size) - self.muted = False - self.retry_on_mic_error = retry - if mute: - self.mute() - - def __enter__(self): - exit_flag = False - while not exit_flag: - try: - return self._start() - except Exception as e: - if not self.retry_on_mic_error: - raise e - LOG.exception("Can't start mic!") - sleep(1) - - def _stream_callback(self, in_data, frame_count, time_info, status): - """Callback from pyaudio. - - Rather than buffer chunks, we simply assigned the current chunk to the - class instance and signal that it's ready. - """ - self.stream.chunk_deque.append(in_data) - self.stream.chunk_ready.set() - return (None, pyaudio.paContinue) - - def _start(self): - """Open the selected device and setup the stream.""" - assert self.stream is None, \ - "This audio source is already inside a context manager" - self.audio = pyaudio.PyAudio() - - wrapped_stream = self.audio.open( - format=self.format, - frames_per_buffer=self.CHUNK, - stream_callback=self._stream_callback, - input_device_index=self.device_index, - rate=self.SAMPLE_RATE, - channels=1, - input=True # stream is an input stream - ) - - self.stream = MutableStream( - wrapped_stream, - format=self.format, - muted=self.muted, - frames_per_buffer=self.CHUNK - ) - return self - - def __exit__(self, exc_type, exc_value, traceback): - return self._stop() - - def _stop(self): - """Stop and close an open stream.""" - try: - if not self.stream.is_stopped(): - self.stream.stop_stream() - self.stream.close() - except Exception: - LOG.exception('Failed to stop mic input stream') - # Let's pretend nothing is wrong... - - self.stream = None - self.audio.terminate() - - def restart(self): - """Shutdown input device and restart.""" - self._stop() - self._start() - - def mute(self): - self.muted = True - if self.stream: - self.stream.mute() - - def unmute(self): - self.muted = False - if self.stream: - self.stream.unmute() - - def is_muted(self): - return self.muted - - def duration_to_bytes(self, sec): - """Converts a duration in seconds to number of recorded bytes. 
- - Args: - sec: number of seconds - - Returns: - (int) equivalent number of bytes recorded by this Mic - """ - return int(sec * self.SAMPLE_RATE) * self.SAMPLE_WIDTH - - -def get_silence(num_bytes): - return b'\0' * num_bytes - - -class ResponsiveRecognizer(speech_recognition.Recognizer): - # Padding of silence when feeding to pocketsphinx - SILENCE_SEC = 0.01 - - # The minimum seconds of noise before a - # phrase can be considered complete - MIN_LOUD_SEC_PER_PHRASE = 0.5 - - # The minimum seconds of silence required at the end - # before a phrase will be considered complete - MIN_SILENCE_AT_END = 0.25 - - # Time between pocketsphinx checks for the wake word - SEC_BETWEEN_WW_CHECKS = 0.2 - - def __init__(self, loop, watchdog=None): - self.loop = loop - self._watchdog = watchdog or (lambda: None) # Default to dummy func - self.config = Configuration() - listener_config = self.config.get('listener') or {} - self.instant_listen = listener_config.get("instant_listen", False) - self.listen_timeout = listener_config.get("listen_timeout", 45) - self._listen_ts = 0 - - self.listen_state = ListenerState.WAKEWORD - if listener_config.get("continuous_listen", False): - self.listen_mode = ListeningMode.CONTINUOUS - self.listen_state = ListenerState.CONTINUOUS - elif listener_config.get("hybrid_listen", False): - self.listen_mode = ListeningMode.HYBRID - else: - self.listen_mode = ListeningMode.WAKEWORD - - self.upload_url = listener_config['wake_word_upload']['url'] - self.upload_disabled = listener_config['wake_word_upload']['disable'] - - self.overflow_exc = listener_config.get('overflow_exception', False) - - super().__init__() - self.audio = pyaudio.PyAudio() - self.multiplier = listener_config.get('multiplier') - self.energy_ratio = listener_config.get('energy_ratio') - - # Check the config for the flag to save wake words, utterances - # and for a path under which to save them - self.save_utterances = listener_config.get('save_utterances', False) - self.save_wake_words = listener_config.get('record_wake_words', False) - self.save_path = listener_config.get('save_path', f"{get_xdg_data_save_path()}/listener") - self.saved_wake_words_dir = join(self.save_path, 'wake_words') - if self.save_wake_words: - os.makedirs(self.saved_wake_words_dir, exist_ok=True) - self.saved_utterances_dir = join(self.save_path, 'utterances') - if self.save_utterances: - os.makedirs(self.saved_utterances_dir, exist_ok=True) - - self.saved_recordings_dir = join(self.save_path, 'recordings') - os.makedirs(self.saved_recordings_dir, exist_ok=True) - - # Signal statuses - self._stop_recording = False - self._stop_signaled = False - self._listen_triggered = False - self._waiting_for_wakeup = False - self._last_ww_ts = 0 - - # identifier used when uploading wakewords to selene - self._account_id = None - - # The maximum seconds a phrase can be recorded, - # provided there is noise the entire time - self.recording_timeout = listener_config.get('recording_timeout', 10.0) - # The maximum time it will continue to record silence - # when not enough noise has been detected - self.recording_timeout_with_silence = listener_config.get('recording_timeout_with_silence', 3.0) - # mic meter settings, will write mic level to ipc, used by debug_cli - # NOTE: this writes a lot to disk, it can be problematic in a sd card if you don't use a tmpfs for ipc - ipc = get_ipc_directory() - os.makedirs(ipc, exist_ok=True) - self.mic_level_file = os.path.join(ipc, "mic_level") - self.mic_meter_ipc_enabled = listener_config.get("mic_meter_ipc", True) - 
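# [editor's illustration] The settings consumed by this constructor live in
# the "listener" section of the user configuration; the defaults referenced
# above are:
LISTENER_DEFAULTS = {
    "instant_listen": False,
    "listen_timeout": 45,                   # seconds, hybrid mode
    "recording_timeout": 10.0,              # max phrase length with noise
    "recording_timeout_with_silence": 3.0,
    "save_utterances": False,
    "record_wake_words": False,
    "mic_meter_ipc": True,                  # writes mic level to the IPC dir
}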
- # The maximum audio in seconds to keep for transcribing a phrase - # The wake word must fit in this time - self.test_ww_sec = listener_config.get("test_ww_sec", 3) - - # Use vad for silence detection - vad_config = listener_config.get("VAD", {}) - method = vad_config.get("silence_method", "ratio_only") - for m in SilenceMethod: - if m.lower() == method.lower(): - method = m - break - else: - LOG.error(f"{method} is invalid!") - method = SilenceMethod.VAD_AND_RATIO - LOG.warning(f"Casting silence method to {method}") - - LOG.info(f"VAD method: {method}") - - module = vad_config.get('module') - no_vad = True - if "vad" in method and module: - # dont load plugin if not being used - LOG.info(f"Creating VAD engine: {vad_config.get('module')}") - try: - vad_plugin = OVOSVADFactory.create(vad_config) - no_vad = False - except: - LOG.error("Failed to load VAD plugin!") - - if no_vad: - if "vad" in method: - LOG.warning(f"{method} selected, but VAD plugin not available!") - method = SilenceMethod.RATIO_ONLY - LOG.warning(f"Casting silence method to {method}") - vad_plugin = None - - self.silence_detector = SilenceDetector( - speech_seconds=vad_config.get("speech_seconds", 0.1), - silence_seconds=vad_config.get("silence_seconds", 0.5), - min_seconds=vad_config.get("min_seconds", 1), - max_seconds=self.recording_timeout, - before_seconds=vad_config.get("before_seconds", 0.5), - current_energy_threshold=vad_config.get("initial_energy_threshold", 1000.0), - silence_method=method, - max_current_ratio_threshold=vad_config.get("max_current_ratio_threshold", 2), - plugin=vad_plugin - ) - - @property - def account_id(self): - """Fetch account from backend when needed. - - If an error occurs it's handled and a temporary value is returned. - When a value is received it will be cached until next start. 
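# [editor's illustration] calc_energy() just below is a thin wrapper around
# stdlib audioop.rms (deprecated since Python 3.11, removed in 3.13):
import audioop

silence = b"\x00\x00" * 160          # 160 16-bit samples of silence
assert audioop.rms(silence, 2) == 0  # width=2 bytes -> RMS energy of 0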
- """ - if not self._account_id: - try: - self._account_id = DeviceApi().get()['user']['uuid'] - except (requests.RequestException, AttributeError): - pass # These are expected and won't be reported - except Exception as e: - LOG.error(f'Unhandled exception while determining device_id, Error: {e}') - - return self._account_id or '0' - - def record_sound_chunk(self, source): - return source.stream.read(source.CHUNK, self.overflow_exc) - - @staticmethod - def calc_energy(sound_chunk, sample_width): - return audioop.rms(sound_chunk, sample_width) - - def feed_hotwords(self, chunk): - """ feed sound chunk to hotword engines that perform - streaming predictions (eg, precise) """ - for ww, hotword in self.loop.hot_words.items(): - hotword["engine"].update(chunk) - - def feed_stopwords(self, chunk): - """ feed sound chunk to stopword engines that perform - streaming predictions (eg, precise) """ - for ww, hotword in self.loop.stop_words.items(): - hotword["engine"].update(chunk) - - def feed_wakeupwords(self, chunk): - """ feed sound chunk to wakeupword engines that perform - streaming predictions (eg, precise) """ - for ww, hotword in self.loop.wakeup_words.items(): - hotword["engine"].update(chunk) - - def _process_hotword(self, audio_data, source, engine, payload, wordtype="hotword"): - """emits a bus event signaling the detection - if mycroft is configured to do so also handles saving to disk and uploading to selene""" - upload_allowed = (self.config['opt_in'] and not self.upload_disabled) - - if upload_allowed or self.save_wake_words: - audio = self._create_audio_data(audio_data, source) - metadata = self._compile_metadata(engine) - - # Save wake word locally - if self.save_wake_words: - filename = self._write_hotword_to_disk(audio, metadata) - payload["filename"] = filename - - # Upload wake word for opt_in people - if upload_allowed: - self._upload_hotword(audio, metadata) - - payload["engine"] = engine.__class__.__name__ - self.loop.emit(f"recognizer_loop:{wordtype}", payload) - - def check_for_wakeup(self, audio_data, source): - # only check for wake up if: - # - a wakeword was detected in previous 5 seconds - # - we are in sleep state - if time.time() - self._last_ww_ts >= 5: - self._waiting_for_wakeup = False - if not self._waiting_for_wakeup or not self.loop.state.sleeping: - return - - try: - for ww, hotword in self.loop.wakeup_words.items(): - if hotword["engine"].found_wake_word(audio_data): - payload = dict(hotword) - payload["hotword"] = ww - self._process_hotword(audio_data, source, - hotword["engine"], payload, - "wakeupword") - self.loop.state.sleeping = False - self.loop.emit('recognizer_loop:awoken') - self._waiting_for_wakeup = False - return True - except RuntimeError: # dictionary changed size during iteration - # seems like config changed and we hit this mid reload! - pass - return False - - def check_for_stop(self, audio_data, source): - # only check for stopwords during recording state - if self.listen_state != ListenerState.RECORDING: - return - try: - for ww, hotword in self.loop.stop_words.items(): - if hotword["engine"].found_wake_word(audio_data): - payload = dict(hotword) - payload["hotword"] = ww - self._process_hotword(audio_data, source, - hotword["engine"], payload, - "stopword") - return True - except RuntimeError: # dictionary changed size during iteration - # seems like config changed and we hit this mid reload! 
- pass - return False - - def check_for_hotwords(self, audio_data, source): - # check hot word - try: - for ww, hotword in self.loop.hot_words.items(): - if hotword["engine"].found_wake_word(audio_data): - yield ww - except RuntimeError: # dictionary changed size during iteration - # seems like config changed and we hit this mid reload! - pass - - def stop_recording(self): - self._stop_recording = True - - def _record_phrase( - self, - source, - sec_per_buffer, - stream=None, - ww_frames=None - ): - """Record an entire spoken phrase. - - Essentially, this code waits for a period of silence and then returns - the audio. If silence isn't detected, it will terminate and return - a buffer of self.recording_timeout duration. - - Args: - source (AudioSource): Source producing the audio chunks - sec_per_buffer (float): Fractional number of seconds in each chunk - stream (AudioStreamHandler): Stream target that will receive chunks - of the utterance audio while it is - being recorded. - ww_frames (deque): Frames of audio data from the last part of wake - word detection. - - Returns: - bytearray: complete audio buffer recorded, including any - silence at the end of the user's utterance - """ - - num_chunks = 0 - - self.silence_detector.start() - if stream: - stream.stream_start() - for chunk in source.stream.iter_chunks(): - if self._stop_recording or check_for_signal('buttonPress'): - break - - if stream: - stream.stream_chunk(chunk) - result = self.silence_detector.process(chunk) - - if self.listen_state == ListenerState.CONTINUOUS: - if result.type == SilenceResultType.PHRASE_END: - break - elif result.type in {SilenceResultType.PHRASE_END, SilenceResultType.TIMEOUT}: - break - - # Periodically write the energy level to the mic level file. - if num_chunks % 10 == 0: - self._watchdog() - self.write_mic_level(result.energy, source) - num_chunks += 1 - - # if in continuous mode do not include silence before phrase - # if in wake word mode include full audio after wake word - return self.silence_detector.stop(phrase_only=self.listen_state == ListenerState.CONTINUOUS) - - def _record_audio(self, source, sec_per_buffer): - """Record audio until signaled to stop - - recording can be interrupted by: - - button press - - bus event - - max timeout defined in trigger message (TODO) - - configured wake words (stop recording, end recording, the end...) 
- - Args: - source (AudioSource): Source producing the audio chunks - - Returns: - bytearray: complete audio buffer recorded, including any - silence at the end of the user's utterance - """ - frame_data = bytes() - - # Max bytes for byte_data before audio is removed from the front - max_size = source.duration_to_bytes(self.test_ww_sec) - test_size = max(3, max_size) - num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE * - source.SAMPLE_WIDTH) - silence = get_silence(num_silent_bytes) - audio_buffer = CyclicAudioBuffer(max_size, silence) - buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer - buffers_since_check = 0.0 - - for chunk in source.stream.iter_chunks(): - if self._stop_recording or \ - check_for_signal('buttonPress'): - break - frame_data += chunk - - # check for stopwords - buffers_since_check += 1.0 - self.feed_stopwords(chunk) - audio_buffer.append(chunk) - if buffers_since_check > buffers_per_check: - buffers_since_check -= buffers_per_check - audio_data = audio_buffer.get_last(test_size) + silence - if self.check_for_stop(audio_data, source): - break - - audio_data = self._create_audio_data(frame_data, source) - return audio_data - - def write_mic_level(self, energy, source): - if self.mic_meter_ipc_enabled: - with open(self.mic_level_file, 'w') as f: - f.write('Energy: cur={} thresh={:.3f} muted={}'.format( - energy, - self.energy_threshold, - int(source.muted) - ) - ) - - def _skip_wake_word(self): - """Check if told programatically to skip the wake word - - For example when we are in a dialog with the user. - """ - if self._listen_triggered: - self._listen_triggered = False - return True - - # Pressing the Mark 1 button can start recording (unless - # it is being used to mean 'stop' instead) - if check_for_signal('buttonPress', 1): - # give other processes time to consume this signal if - # it was meant to be a 'stop' - sleep(0.25) - if check_for_signal('buttonPress'): - # Signal is still here, assume it was intended to - # begin recording - LOG.debug("Button Pressed, wakeword not needed") - return True - - return False - - def stop(self): - """Signal stop and exit waiting state.""" - self._stop_recording = True - self._stop_signaled = True - - def _compile_metadata(self, engine): - ww_module = engine.__class__.__name__ - if ww_module == 'PreciseHotword': - model_path = engine.precise_model - with open(model_path, 'rb') as f: - model_hash = md5(f.read()).hexdigest() - else: - model_hash = '0' - - return { - 'name': engine.key_phrase, - 'engine': md5(ww_module.encode('utf-8')).hexdigest(), - 'time': str(int(1000 * get_time())), - 'sessionId': SessionManager.get().session_id, - 'accountId': self.account_id, - 'model': str(model_hash) - } - - def trigger_listen(self): - """Externally trigger listening.""" - LOG.info('Listen triggered from external source.') - self._listen_triggered = True - if self.config.get("confirm_listening"): - play_listening_sound() - - def _upload_hotword(self, audio, metadata): - """Upload the wakeword in a background thread.""" - def upload(audio, metadata): - DatasetApi().upload_wake_word(audio.get_wav_data(), metadata, upload_url=self.upload_url) - - Thread(target=upload, daemon=True, args=(audio, metadata)).start() - - def _write_hotword_to_disk(self, audio, metadata): - """Write wakeword to disk. 
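# [editor's illustration] The method below derives the .wav name from the
# metadata values sorted by key; with a hypothetical save dir:
from os.path import join

metadata = {"name": "hey_mycroft", "time": "1677880000000", "model": "0"}
filename = join("/tmp/wake_words",
                "_".join(str(metadata[k]) for k in sorted(metadata)) + ".wav")
assert filename == "/tmp/wake_words/0_hey_mycroft_1677880000000.wav"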
- - Args: - audio: Audio data to write - metadata: List of metadata about the captured wakeword - """ - filename = join(self.saved_wake_words_dir, - '_'.join(str(metadata[k]) for k in sorted(metadata)) + - '.wav') - with open(filename, 'wb') as f: - f.write(audio.get_wav_data()) - return filename - - def _handle_hotword_found(self, hotword, audio_data, source): - """Perform actions to be triggered after a hotword is found. - - This includes: emit event on messagebus that a hotword is heard, - execute hotword specific pre-configured actions, - store hotword to disk if configured and sending the hotword data - to the cloud in case the user has opted into the data sharing. - """ - - engine = self.loop.engines[hotword]["engine"] - sound = self.loop.engines[hotword]["sound"] - utterance = self.loop.engines[hotword]["utterance"] - listen = self.loop.engines[hotword]["listen"] - stt_lang = self.loop.engines[hotword]["stt_lang"] - event = self.loop.engines[hotword]["bus_event"] - - if self.loop.state.sleeping: - if listen: - # start listening for follow up wakeup words - self._waiting_for_wakeup = True - self._last_ww_ts = time.time() - return # no wake word handling during sleep mode - - payload = dict(self.loop.engines[hotword]) - payload["hotword"] = hotword - - if utterance: - LOG.debug("Hotword utterance: " + utterance) - # send the transcribed word on for processing - payload = { - 'utterances': [utterance], - "lang": stt_lang - } - self.loop.emit("recognizer_loop:utterance", payload) - elif listen: - payload["utterance"] = hotword - self._process_hotword(audio_data, source, - engine, payload, "wakeword") - else: - self._process_hotword(audio_data, source, - engine, payload, "hotword") - - if event: - self.loop.emit("recognizer_loop:hotword_event", - {"msg_type": event}) - - # If enabled, play a wave file with a short sound to audibly - # indicate hotword was detected. - if sound: - try: - sound = resolve_resource_file(sound) - if self.instant_listen or not listen: - play_audio_file(sound) - else: - source.mute() - play_audio_file(sound).wait() - source.unmute() - except Exception as e: - LOG.warning(e) - - def _wait_until_wake_word(self, source, sec_per_buffer): - """Listen continuously on source until a wake word is spoken - - Args: - source (AudioSource): Source producing the audio chunks - sec_per_buffer (float): Fractional number of seconds in each chunk - """ - - # The maximum audio in seconds to keep for transcribing a phrase - # The wake word must fit in this time - ww_duration = self.test_ww_sec - ww_test_duration = max(3, ww_duration) - - num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE * - source.SAMPLE_WIDTH) - - silence = get_silence(num_silent_bytes) - - # Max bytes for byte_data before audio is removed from the front - max_size = source.duration_to_bytes(ww_duration) - test_size = source.duration_to_bytes(ww_test_duration) - audio_buffer = CyclicAudioBuffer(max_size, silence) - - buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer - buffers_since_check = 0.0 - - # History of audio energies. - # Used to adjust threshold for ambient noise. 
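# [editor's illustration] The ambient-noise adjustment further down: once
# four energy readings accumulate, the silence threshold tracks twice the
# recent maximum, and the history is cleared every ~3 s (12 readings):
energies = [120.0, 250.0, 90.0, 300.0]
if len(energies) >= 4:
    current_energy_threshold = max(energies) * 2  # -> 600.0
if len(energies) >= 12:
    energies = []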
- energies = [] - energy = 0.0 - mic_write_counter = 0 - - # These are frames immediately after wake word is detected - # that we want to keep to send to STT - ww_frames = deque(maxlen=7) - - audio_data = silence - while not self._stop_signaled: - for chunk in source.stream.iter_chunks(): - if self._skip_wake_word(): - return WakeWordData(audio_data, False, - self._stop_signaled, ww_frames), \ - self.config.get("lang", "en-us") - - if self._stop_signaled: - LOG.info("Stopping") - break - - audio_buffer.append(chunk) - ww_frames.append(chunk) - - buffers_since_check += 1.0 - if self.loop.state.sleeping: - self.feed_wakeupwords(chunk) - self.feed_hotwords(chunk) - if buffers_since_check > buffers_per_check: - buffers_since_check -= buffers_per_check - audio_data = audio_buffer.get_last(test_size) + silence - said_hot_word = False - - # check for wake up command to come out of sleep state - was_wakeup = self.check_for_wakeup(audio_data, source) - - # else check for hotwords - if not was_wakeup: - for hotword in self.check_for_hotwords(audio_data, - source): - said_hot_word = True - listen = self.loop.engines[hotword]["listen"] - stt_lang = self.loop.engines[hotword]["stt_lang"] - self._handle_hotword_found(hotword, audio_data, source) - if listen and not self.loop.state.sleeping: - return WakeWordData(audio_data, True, - self._stop_signaled, - ww_frames), stt_lang - - if said_hot_word: - # reset bytearray to store wake word audio in, else many - # serial detections - audio_buffer.clear() - else: - energy = SilenceDetector.get_debiased_energy(chunk) - energies.append(energy) - - if len(energies) >= 4: - # Adjust energy threshold once per second - # avg_energy = sum(energies) / len(energies) - max_energy = max(energies) - self.silence_detector.current_energy_threshold = max_energy * 2 - - if len(energies) >= 12: - # Clear energy history after 3 seconds - energies = [] - - # Periodically output energy level stats. This can be used to - # visualize the microphone input, e.g. a needle on a meter. - if mic_write_counter % 3: - self._watchdog() - self.write_mic_level(energy, source) - mic_write_counter += 1 - LOG.info("Stopping...") - return WakeWordData(None, False, True, None), "" - - @staticmethod - def _create_audio_data(raw_data, source): - """ - Constructs an AudioData instance with the same parameters - as the source and the specified frame_data - """ - return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH) - - def extend_listening(self): - """ reset the timeout until wakeword is needed again - only used when in hybrid listening mode """ - self._listen_ts = time.time() - - def listen(self, source, stream): - """Listens for chunks of audio that Mycroft should perform STT on. - - This will listen continuously for a wake-up-word, then return the - audio chunk containing the spoken phrase that comes immediately - afterwards. - - Args: - source (AudioSource): Source producing the audio chunks - stream (AudioStreamHandler): Stream target that will receive chunks - of the utterance audio while it is - being recorded - - Returns: - (AudioData, lang): audio with the user's utterance (minus the - wake-up-word), stt_lang - """ - assert isinstance(source, AudioSource), "Source must be an AudioSource" - - # bytes_per_sec = source.SAMPLE_RATE * source.SAMPLE_WIDTH - sec_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE - - # Every time a new 'listen()' request begins, reset the threshold - # used for silence detection. 
This is as good of a reset point as - # any, as we expect the user and Mycroft to not be talking. - # NOTE: adjust_for_ambient_noise() doc claims it will stop early if - # speech is detected, but there is no code to actually do that. - # TODO consider (re)moving this and making configurable - self.adjust_for_ambient_noise(source, 0.3) - - audio_data = None - lang = get_message_lang() - self._stop_recording = False - - if self.listen_state == ListenerState.WAKEWORD: - LOG.debug("Waiting for wake word...") - ww_data, lang = self._wait_until_wake_word(source, sec_per_buffer) - LOG.debug("Done waiting for WW") - if ww_data.stopped or self.loop.state.sleeping: - LOG.debug(f"No data: stopped={ww_data.stopped}") - # If the waiting returned from a stop signal or sleep mode is active - return None, lang - - audio_data = self._listen_phrase(source, sec_per_buffer, stream) - if self.listen_mode != ListeningMode.WAKEWORD: - self.listen_state = ListenerState.CONTINUOUS - self.extend_listening() - - elif self.listen_state == ListenerState.CONTINUOUS: - LOG.debug("Listening...") - audio_data = self._listen_phrase(source, sec_per_buffer, stream) - - # reset to wake word mode if 45 seconds elapsed - if self.listen_mode == ListeningMode.HYBRID and \ - time.time() - self._listen_ts > self.listen_timeout: - self.listen_state = ListenerState.WAKEWORD - - elif self.listen_state == ListenerState.RECORDING: - LOG.debug("Recording...") - self.loop.emit("recognizer_loop:record_begin") - audio_data = self._record_audio(source, sec_per_buffer) - LOG.info("Saving Recording") - # TODO allow name from trigger bus message ? - stamp = str(datetime.datetime.now()) - filename = f"/{self.saved_recordings_dir}/{stamp}.wav" - with open(filename, 'wb') as filea: - filea.write(audio_data.get_wav_data()) - - self.loop.emit("recognizer_loop:record_end", - {"filename": filename}) - - # reset listener state, we dont want to accidentally save 24h of audio per day .... 
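# [editor's illustration] The listen() flow above as a state machine:
#   WAKEWORD --wake word heard--> CONTINUOUS
#   CONTINUOUS --listen_timeout (45 s default) elapsed--> WAKEWORD
# extend_listening() refreshes the timestamp on every interaction:
import time

listen_ts = time.time()        # reset by extend_listening()
LISTEN_TIMEOUT = 45
if time.time() - listen_ts > LISTEN_TIMEOUT:
    listen_state = "wakeword"  # ListenerState.WAKEWORD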
- # experimental setting, no wake word needed - if self.listen_mode == ListeningMode.CONTINUOUS: - self.listen_state = ListenerState.CONTINUOUS - else: - self.listen_state = ListenerState.WAKEWORD - - # recording mode should not trigger STT - return None, lang - - LOG.debug("Thinking...") - play_end_listening_sound() - return audio_data, lang - - def _listen_phrase(self, source, sec_per_buffer, stream): - """ record user utterance and save recording if needed""" - LOG.debug("Recording...") - self.loop.emit("recognizer_loop:record_begin") - frame_data = self._record_phrase(source, sec_per_buffer, stream) - audio_data = self._create_audio_data(frame_data, source) - filename = None - if self.save_utterances: - LOG.info("Saving Utterance Recording") - stamp = str(datetime.datetime.now()) - filename = f"/{self.saved_utterances_dir}/{stamp}.wav" - with open(filename, 'wb') as filea: - filea.write(audio_data.get_wav_data()) - self.loop.emit("recognizer_loop:record_end", {"filename": filename}) - return audio_data - - def adjust_for_ambient_noise(self, source, seconds=1.0): - chunks_per_second = source.CHUNK / source.SAMPLE_RATE - num_chunks = int(seconds / chunks_per_second) - - energies = [] - for chunk in itertools.islice(source.stream.iter_chunks(), num_chunks): - energy = SilenceDetector.get_debiased_energy(chunk) - energies.append(energy) - - if energies: - avg_energy = sum(energies) / len(energies) - self.silence_detector.current_energy_threshold = avg_energy - LOG.info(f"Silence threshold adjusted to {self.silence_detector.current_energy_threshold}") - - def _adjust_threshold(self, energy, seconds_per_buffer): - if self.dynamic_energy_threshold and energy > 0: - # account for different chunk sizes and rates - damping = ( - self.dynamic_energy_adjustment_damping ** seconds_per_buffer) - target_energy = energy * self.energy_ratio - self.energy_threshold = ( - self.energy_threshold * damping + - target_energy * (1 - damping)) +from ovos_listener.data_structures import RollingMean, CyclicAudioBuffer +from ovos_listener.silence import SilenceDetector, SilenceResultType, SilenceMethod +from ovos_listener.mic import WakeWordData, ListenerState, ListeningMode, MutableStream, \ + MutableMicrophone, ResponsiveRecognizer, get_silence diff --git a/mycroft/listener/service.py b/mycroft/listener/service.py index 94f55cbd352c..9b72e35f6791 100644 --- a/mycroft/listener/service.py +++ b/mycroft/listener/service.py @@ -1,408 +1,8 @@ -# Copyright 2017 Mycroft AI Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
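# [editor's illustration] With the shim below, the voice service launches
# from either namespace; this mirrors __main__.py above and assumes a
# running messagebus to connect to:
from ovos_listener.service import (SpeechService, on_error, on_ready,
                                   on_stopping)

service = SpeechService(on_ready=on_ready, on_error=on_error,
                        on_stopping=on_stopping)
service.start()  # SpeechService is a Thread subclass; shutdown() stops it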
-# +# backwards compat imports +from ovos_listener import RecognizerLoop +from ovos_listener.mic import ListenerState, ListeningMode +from ovos_listener.service import SpeechService, on_ready, on_stopping, on_error -from threading import Thread - -from mycroft.listener import RecognizerLoop -from mycroft.listener.mic import ListenerState, ListeningMode -from ovos_config.config import Configuration -from mycroft.enclosure.api import EnclosureAPI -from ovos_backend_client.identity import IdentityManager -from mycroft.messagebus.message import Message -from mycroft.util import ( - start_message_bus_client -) -from ovos_utils.log import LOG -from ovos_utils.process_utils import ProcessStatus, StatusCallbackMap -from ovos_plugin_manager.stt import get_stt_lang_configs, get_stt_supported_langs, get_stt_module_configs -from ovos_plugin_manager.wakewords import get_ww_lang_configs, get_ww_supported_langs, get_ww_module_configs -from ovos_plugin_manager.vad import get_vad_configs - - -def on_ready(): - LOG.info('Speech client is ready.') - - -def on_stopping(): - LOG.info('Speech service is shutting down...') - - -def on_error(e='Unknown'): - LOG.error('Audio service failed to launch ({}).'.format(repr(e))) - - -class SpeechService(Thread): - def __init__(self, on_ready=on_ready, on_error=on_error, - on_stopping=on_stopping, watchdog=lambda: None, - bus=None, loop=None): - super(SpeechService, self).__init__() - - callbacks = StatusCallbackMap(on_ready=on_ready, - on_error=on_error, - on_stopping=on_stopping) - self.status = ProcessStatus('speech', callback_map=callbacks) - self.status.set_started() - - self.config = Configuration() - self.bus = bus or start_message_bus_client("VOICE") - - self.status.bind(self.bus) - - # Register handlers on internal RecognizerLoop bus - self.loop = loop or RecognizerLoop(self.bus, watchdog) - self.connect_loop_events() - self.connect_bus_events() - - # loop events - def handle_record_begin(self): - """Forward internal bus message to external bus.""" - LOG.info("Begin Recording...") - context = {'client_name': 'mycroft_listener', - 'source': 'audio'} - self.bus.emit(Message('recognizer_loop:record_begin', context=context)) - - def handle_record_end(self, event): - """Forward internal bus message to external bus.""" - LOG.info("End Recording...") - context = {'client_name': 'mycroft_listener', - 'source': 'audio'} - self.bus.emit(Message('recognizer_loop:record_end', event, context=context)) - - def handle_no_internet(self): - LOG.debug("Notifying enclosure of no internet connection") - context = {'client_name': 'mycroft_listener', - 'source': 'audio'} - self.bus.emit(Message('enclosure.notify.no_internet', context=context)) - - def handle_awoken(self): - """Forward mycroft.awoken to the messagebus.""" - LOG.info("Listener is now Awake: ") - context = {'client_name': 'mycroft_listener', - 'source': 'audio'} - self.bus.emit(Message('mycroft.awoken', context=context)) - - def handle_wakeword(self, event): - LOG.info("Wakeword Detected: " + event['hotword']) - self.bus.emit(Message('recognizer_loop:wakeword', event)) - - def handle_hotword(self, event): - LOG.info("Hotword Detected: " + event['hotword']) - self.bus.emit(Message('recognizer_loop:hotword', event)) - - def handle_stopword(self, event): - LOG.info("Stop word Detected: " + event['hotword']) - self.bus.emit(Message('recognizer_loop:stopword', event)) - - def handle_wakeupword(self, event): - LOG.info("WakeUp word Detected: " + event['hotword']) - self.bus.emit(Message('recognizer_loop:wakeupword', event)) - 
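# [editor's illustration] handle_change_state()/handle_get_state() below are
# driven over the messagebus; assuming a reachable bus to connect to:
from mycroft.messagebus.message import Message
from mycroft.util import start_message_bus_client

bus = start_message_bus_client("VOICE")
bus.emit(Message("recognizer_loop:state.set",
                 {"state": "continuous", "mode": "hybrid"}))
bus.emit(Message("recognizer_loop:state.get"))  # reply: recognizer_loop:state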
- def handle_hotword_event(self, event): - """ hotword configured to emit a bus event - forward event from internal emitter to mycroft bus""" - self.bus.emit(Message(event["msg_type"])) - - def handle_utterance(self, event): - LOG.info("Utterance: " + str(event['utterances'])) - context = {'client_name': 'mycroft_listener', - 'source': 'audio', - 'destination': ["skills"]} - if 'ident' in event: - ident = event.pop('ident') - context['ident'] = ident - self.bus.emit(Message('recognizer_loop:utterance', event, context)) - - def handle_unknown(self): - context = {'client_name': 'mycroft_listener', - 'source': 'audio'} - self.bus.emit( - Message('mycroft.speech.recognition.unknown', context=context)) - - def handle_speak(self, event): - """ - Forward speak message to message bus. - """ - context = {'client_name': 'mycroft_listener', - 'source': 'audio'} - self.bus.emit(Message('speak', event, context)) - - def handle_complete_intent_failure(self, event): - """DEPRECATED - this handler is no longer called """ - - def handle_change_state(self, event): - """Set listening state.""" - state = event.data.get("state") - mode = event.data.get("mode") - needs_skip = self.loop.listen_state == ListenerState.WAKEWORD - - if state: - if state == ListenerState.WAKEWORD: - self.loop.listen_state = ListenerState.WAKEWORD - elif state == ListenerState.CONTINUOUS: - self.loop.listen_state = ListenerState.CONTINUOUS - elif state == ListenerState.RECORDING: - self.loop.listen_state = ListenerState.RECORDING - else: - LOG.error(f"Invalid listening state: {state}") - - if mode: - if mode == ListeningMode.WAKEWORD: - self.loop.listen_mode = ListeningMode.WAKEWORD - elif mode == ListeningMode.CONTINUOUS: - self.loop.listen_mode = ListeningMode.CONTINUOUS - elif mode == ListeningMode.HYBRID: - self.loop.listen_mode = ListeningMode.HYBRID - else: - LOG.error(f"Invalid listen mode: {mode}") - - # signal the recognizer to stop waiting for a wakeword - # in order for it to enter the new state - if needs_skip: - self.loop.responsive_recognizer._listen_triggered = True - - self.handle_get_state(event) - - def handle_get_state(self, event): - """Query listening state""" - data = {'mode': self.loop.listen_mode, - "state": self.loop.listen_state} - self.bus.emit(event.reply("recognizer_loop:state", data)) - - def handle_stop_recording(self, event): - """Stop current recording session """ - self.loop.responsive_recognizer.stop_recording() - - def handle_extend_listening(self, event): - """ when a skill is activated (converse) reset - the timeout until wakeword is needed again - only used when in hybrid listening mode """ - if self.loop.listen_mode == ListeningMode.HYBRID: - self.loop.responsive_recognizer.extend_listening() - - def handle_sleep(self, event): - """Put the recognizer loop to sleep.""" - self.loop.sleep() - - def handle_wake_up(self, event): - """Wake up the the recognize loop.""" - self.loop.awaken() - - def handle_mic_mute(self, event): - """Mute the listener system.""" - self.loop.mute() - - def handle_mic_unmute(self, event): - """Unmute the listener system.""" - self.loop.unmute() - - def handle_mic_listen(self, _): - """Handler for mycroft.mic.listen. - - Starts listening as if wakeword was spoken. - """ - self.loop.responsive_recognizer.trigger_listen() - - def handle_mic_get_status(self, event): - """Query microphone mute status.""" - data = {'muted': self.loop.is_muted()} - self.bus.emit(event.response(data)) - - def handle_paired(self, event): - """Update identity information with pairing data. 
-    def handle_paired(self, event):
-        """Update identity information with pairing data.
-
-        This is done here to make sure it's only done in a single place.
-        TODO: Is there a reason this isn't done directly in the pairing skill?
-        """
-        IdentityManager.update(event.data)
-
-    def handle_audio_start(self, event):
-        """Mute recognizer loop."""
-        if self.config.get("listener").get("mute_during_output"):
-            self.loop.mute()
-
-    def handle_audio_end(self, event):
-        """Request unmute, if more sources have requested the mic to be muted
-        it will remain muted.
-        """
-        if self.config.get("listener").get("mute_during_output"):
-            self.loop.unmute()  # restore
-
-    def handle_stop(self, event):
-        """Handler for mycroft.stop, i.e. button press."""
-        self.loop.force_unmute()
-
-    def handle_open(self):
-        # TODO: Move this into the Enclosure (not speech client)
-        # Reset the UI to indicate ready for speech processing
-        EnclosureAPI(self.bus).reset()
-
-    def handle_get_languages_stt(self, message):
-        """
-        Handle a request for supported STT languages
-        :param message: ovos.languages.stt request
-        """
-        stt_langs = self.loop.stt.available_languages or \
-            [self.config.get('lang') or 'en-us']
-        LOG.debug(f"Got stt_langs: {stt_langs}")
-        self.bus.emit(message.response({'langs': list(stt_langs)}))
-
-    @staticmethod
-    def get_stt_lang_options(lang, blacklist=None):
-        blacklist = blacklist or []
-        opts = []
-        cfgs = get_stt_lang_configs(lang=lang, include_dialects=True)
-        for engine, configs in cfgs.items():
-            if engine in blacklist:
-                continue
-            # For display purposes, show the engine name without underscores
-            # or dashes and with each word capitalized
-            plugin_display_name = engine.replace("_", " ").replace("-", " ").title()
-            for config in configs:
-                config["plugin_name"] = plugin_display_name
-                config["engine"] = engine
-                config["lang"] = config.get("lang") or lang
-                opts.append(config)
-        return opts
-
-    @staticmethod
-    def get_ww_lang_options(lang, blacklist=None):
-        blacklist = blacklist or []
-        opts = []
-        cfgs = get_ww_lang_configs(lang=lang, include_dialects=True)
-        for engine, configs in cfgs.items():
-            if engine in blacklist:
-                continue
-            # For display purposes, show the engine name without underscores
-            # or dashes and with each word capitalized
-            plugin_display_name = engine.replace("_", " ").replace("-", " ").title()
-            for config in configs:
-                config["plugin_name"] = plugin_display_name
-                config["engine"] = engine
-                config["lang"] = config.get("lang") or lang
-                opts.append(config)
-        return opts
-
-    @staticmethod
-    def get_vad_options(blacklist=None):
-        blacklist = blacklist or []
-        tts_opts = []
-        cfgs = get_vad_configs()
-        for engine, configs in cfgs.items():
-            if engine in blacklist:
-                continue
-            # For display purposes, show the engine name without underscores
-            # or dashes and with each word capitalized
-            plugin_display_name = engine.replace("_", " ").replace("-", " ").title()
-            for voice in configs:
-                voice["plugin_name"] = plugin_display_name
-                voice["engine"] = engine
-                tts_opts.append(voice)
-        return tts_opts
-
-    def handle_opm_stt_query(self, message):
-        plugs = get_stt_supported_langs()
-        configs = {}
-        opts = {}
-        for lang, m in plugs.items():
-            for p in m:
-                configs[p] = get_stt_module_configs(p)
-            opts[lang] = self.get_stt_lang_options(lang)
-
-        data = {
-            "plugins": plugs,
-            "langs": list(plugs.keys()),
-            "configs": configs,
-            "options": opts
-        }
-        self.bus.emit(message.response(data))
-
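# --- Editor's illustration (not part of the upstream patch) -------------------
# The display-name rule used in the OPM option builders above, in isolation:
def plugin_display_name(engine: str) -> str:
    return engine.replace("_", " ").replace("-", " ").title()

assert plugin_display_name("ovos-ww-plugin-precise") == "Ovos Ww Plugin Precise"
# ------------------------------------------------------------------------------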
-    def handle_opm_ww_query(self, message):
-        plugs = get_ww_supported_langs()
-        configs = {}
-        opts = {}
-        for lang, m in plugs.items():
-            for p in m:
-                configs[p] = get_ww_module_configs(p)
-            opts[lang] = self.get_ww_lang_options(lang)
-
-        data = {
-            "plugins": plugs,
-            "langs": list(plugs.keys()),
-            "configs": configs,
-            "options": opts
-        }
-        self.bus.emit(message.response(data))
-
-    def handle_opm_vad_query(self, message):
-        cfgs = get_vad_configs()
-        data = {
-            "plugins": list(cfgs.keys()),
-            "configs": cfgs,
-            "options": self.get_vad_options()
-        }
-        self.bus.emit(message.response(data))
-
-    def connect_loop_events(self):
-        self.loop.on('recognizer_loop:utterance', self.handle_utterance)
-        self.loop.on('recognizer_loop:speech.recognition.unknown',
-                     self.handle_unknown)
-        self.loop.on('speak', self.handle_speak)
-        self.loop.on('recognizer_loop:record_begin', self.handle_record_begin)
-        self.loop.on('recognizer_loop:awoken', self.handle_awoken)
-        self.loop.on('recognizer_loop:wakeword', self.handle_wakeword)
-        self.loop.on('recognizer_loop:hotword', self.handle_hotword)
-        self.loop.on('recognizer_loop:stopword', self.handle_stopword)
-        self.loop.on('recognizer_loop:wakeupword', self.handle_wakeupword)
-        self.loop.on('recognizer_loop:record_end', self.handle_record_end)
-        self.loop.on('recognizer_loop:no_internet', self.handle_no_internet)
-        self.loop.on('recognizer_loop:hotword_event',
-                     self.handle_hotword_event)
-
-    def connect_bus_events(self):
-        # Register handlers for events on main Mycroft messagebus
-        self.bus.on('open', self.handle_open)
-        self.bus.on('recognizer_loop:sleep', self.handle_sleep)
-        self.bus.on('recognizer_loop:wake_up', self.handle_wake_up)
-        self.bus.on('recognizer_loop:record_stop', self.handle_stop_recording)
-        self.bus.on('recognizer_loop:state.set', self.handle_change_state)
-        self.bus.on('recognizer_loop:state.get', self.handle_get_state)
-        self.bus.on('mycroft.mic.mute', self.handle_mic_mute)
-        self.bus.on('mycroft.mic.unmute', self.handle_mic_unmute)
-        self.bus.on('mycroft.mic.get_status', self.handle_mic_get_status)
-        self.bus.on('mycroft.mic.listen', self.handle_mic_listen)
-        self.bus.on("mycroft.paired", self.handle_paired)
-        self.bus.on('recognizer_loop:audio_output_start',
-                    self.handle_audio_start)
-        self.bus.on('recognizer_loop:audio_output_end', self.handle_audio_end)
-        self.bus.on('mycroft.stop', self.handle_stop)
-        self.bus.on("ovos.languages.stt", self.handle_get_languages_stt)
-        self.bus.on("intent.service.skills.activated", self.handle_extend_listening)
-        self.bus.on("opm.stt.query", self.handle_opm_stt_query)
-        self.bus.on("opm.ww.query", self.handle_opm_ww_query)
-        self.bus.on("opm.vad.query", self.handle_opm_vad_query)
-
-    def run(self):
-        self.status.set_alive()
-        try:
-            self.status.set_ready()
-            self.loop.run()
-        except Exception as e:
-            self.status.set_error(e)
-
-        self.shutdown()
-
-    def shutdown(self):
-        self.status.set_stopping()
-        self.loop.stop()
 
 class SpeechClient(SpeechService):
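# --- Editor's lifecycle sketch (not part of the upstream patch) ---------------
# Assumes a messagebus is already running. SpeechService is a Thread, so
# start() invokes run(), which reports ready and blocks in loop.run() until
# shutdown() stops the recognizer loop.
import time
from ovos_listener.service import SpeechService

service = SpeechService()  # wires up the loop + bus handlers shown above
service.start()
time.sleep(60)             # ... listener runs for a while ...
service.shutdown()
# ------------------------------------------------------------------------------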
diff --git a/mycroft/listener/silence.py b/mycroft/listener/silence.py
index 03290e60972a..91617959d64c 100644
--- a/mycroft/listener/silence.py
+++ b/mycroft/listener/silence.py
@@ -1,422 +1,2 @@
-# Copyright 2022 Mycroft AI Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import audioop
-import math
-import typing
-from collections import deque
-from dataclasses import dataclass
-from enum import Enum
-
-
-class SilenceMethod(str, Enum):
-    """Method used to determine if an audio frame contains silence.
-
-    Values
-    ------
-    VAD_ONLY
-        Only use webrtcvad
-
-    RATIO_ONLY
-        Only use max/current energy ratio threshold
-
-    CURRENT_ONLY
-        Only use current energy threshold
-
-    VAD_AND_RATIO
-        Use webrtcvad and max/current energy ratio threshold
-
-    VAD_AND_CURRENT
-        Use webrtcvad and current energy threshold
-
-    ALL
-        Use webrtcvad, max/current energy ratio, and current energy threshold
-    """
-
-    VAD_ONLY = "vad_only"
-    RATIO_ONLY = "ratio_only"
-    CURRENT_ONLY = "current_only"
-    VAD_AND_RATIO = "vad_and_ratio"
-    VAD_AND_CURRENT = "vad_and_current"
-    ALL = "all"
-
-
-class SilenceResultType(str, Enum):
-    SILENCE = "silence"
-    SPEECH = "speech"
-    TIMEOUT = "timeout"
-
-    PHRASE_START = "phrase_start"
-    PHRASE_END = "phrase_end"
-
-
-@dataclass
-class SilenceResult:
-    type: SilenceResultType
-    energy: float
-
-
-# -----------------------------------------------------------------------------
-
-
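# --- Editor's note (not part of the upstream patch) ---------------------------
# Because SilenceMethod/SilenceResultType subclass str, members compare equal
# to their plain-string wire values, which keeps bus payloads simple:
from ovos_listener.silence import SilenceMethod, SilenceResultType

assert SilenceResultType.PHRASE_END == "phrase_end"
assert SilenceMethod("vad_and_ratio") is SilenceMethod.VAD_AND_RATIO
# ------------------------------------------------------------------------------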
-class SilenceDetector:
-    """Detect speech/silence using Silero VAD.
-
-    Attributes
-    ----------
-    vad_threshold: float = 0.2
-        Value in [0-1], below which is considered silence
-
-    sample_rate: int = 16000
-        Sample rate of audio chunks (hertz)
-
-    chunk_size: int = 960
-        Must be 30, 60, or 100 ms in duration
-
-    skip_seconds: float = 0
-        Seconds of audio to skip before voice command detection starts
-
-    speech_seconds: float = 0.3
-        Seconds of speech before voice command has begun
-
-    before_seconds: float = 0.5
-        Seconds of audio to keep before voice command has begun
-
-    min_seconds: float = 1.0
-        Minimum length of voice command (seconds)
-
-    max_seconds: Optional[float] = 30.0
-        Maximum length of voice command before timeout (seconds, None for no timeout)
-
-    silence_seconds: float = 0.5
-        Seconds of silence before a voice command has finished
-
-    max_energy: Optional[float] = None
-        Maximum denoise energy value (None for dynamic setting from observed audio)
-
-    max_current_ratio_threshold: Optional[float] = None
-        Ratio of max/current energy below which audio is considered speech
-
-    current_energy_threshold: Optional[float] = None
-        Energy threshold above which audio is considered speech
-
-    silence_method: SilenceMethod = "vad_only"
-        Method for deciding if an audio chunk contains silence or speech
-    """
-
-    def __init__(
-        self,
-        sample_rate: int = 16000,
-        chunk_size: int = 960,
-        skip_seconds: float = 0,
-        min_seconds: float = 1,
-        max_seconds: typing.Optional[float] = 30,
-        speech_seconds: float = 0.3,
-        silence_seconds: float = 0.5,
-        before_seconds: float = 0.5,
-        max_energy: typing.Optional[float] = None,
-        max_current_ratio_threshold: typing.Optional[float] = None,
-        current_energy_threshold: typing.Optional[float] = None,
-        silence_method: SilenceMethod = SilenceMethod.VAD_ONLY,
-        plugin: str = None
-    ):
-        self.sample_rate = sample_rate
-        self.sample_width = 2  # 16-bit
-        self.sample_channels = 1  # mono
-        self.chunk_size = chunk_size
-        self.skip_seconds = skip_seconds
-        self.min_seconds = min_seconds
-        self.max_seconds = max_seconds
-        self.speech_seconds = speech_seconds
-        self.silence_seconds = silence_seconds
-        self.before_seconds = before_seconds
-
-        self.max_energy = max_energy
-        self.dynamic_max_energy = max_energy is None
-        self.max_current_ratio_threshold = max_current_ratio_threshold
-        self.current_energy_threshold = current_energy_threshold
-        self.silence_method = silence_method
-
-        # Verify settings
-        if self.silence_method in [
-            SilenceMethod.VAD_ONLY,
-            SilenceMethod.VAD_AND_RATIO,
-            SilenceMethod.VAD_AND_CURRENT,
-            SilenceMethod.ALL,
-        ]:
-            self.use_vad = True
-        else:
-            self.use_vad = False
-
-        if self.silence_method in [
-            SilenceMethod.VAD_AND_RATIO,
-            SilenceMethod.RATIO_ONLY,
-            SilenceMethod.ALL,
-        ]:
-            self.use_ratio = True
-            assert (
-                self.max_current_ratio_threshold is not None
-            ), "Max/current ratio threshold is required"
-        else:
-            self.use_ratio = False
-
-        if self.silence_method in [
-            SilenceMethod.VAD_AND_CURRENT,
-            SilenceMethod.CURRENT_ONLY,
-            SilenceMethod.ALL,
-        ]:
-            self.use_current = True
-            assert (
-                self.current_energy_threshold is not None
-            ), "Current energy threshold is required"
-        else:
-            self.use_current = False
-
-        # Voice detector
-        self.vad = plugin
-        if not plugin:
-            self.use_vad = False
-            if not self.use_current and not self.use_ratio:
-                self.use_ratio = True
-
-        self.seconds_per_buffer = (
-            self.chunk_size / self.sample_width
-        ) / self.sample_rate
-
-        # Store some number of seconds of audio data immediately before voice command starts
-        self.before_buffers = int(
-            math.ceil(self.before_seconds / self.seconds_per_buffer)
-        )
-
-        # Pre-compute values
-        self.speech_buffers = int(
-            math.ceil(self.speech_seconds / self.seconds_per_buffer)
-        )
-
-        self.skip_buffers = int(math.ceil(self.skip_seconds / self.seconds_per_buffer))
-
-        # State
-        self.before_phrase_chunks: typing.Deque[bytes] = deque(
-            maxlen=self.before_buffers
-        )
-        self.phrase_buffer: bytes = bytes()
-
-        self.max_buffers: typing.Optional[int] = None
-        self.min_phrase_buffers: int = 0
-        self.skip_buffers_left: int = 0
-        self.speech_buffers_left: int = 0
-        self.in_phrase: bool = False
-        self.after_phrase: bool = False
-        self.silence_buffers: int = 0
-        self.current_seconds: float = 0
-        self.current_chunk: bytes = bytes()
-
-    def start(self):
-        """Begin new voice command."""
-
-        # State
-        self.before_phrase_chunks.clear()
-        self.phrase_buffer = bytes()
-
-        if self.max_seconds:
-            self.max_buffers = int(
-                math.ceil(self.max_seconds / self.seconds_per_buffer)
-            )
-        else:
-            self.max_buffers = None
-
-        self.min_phrase_buffers = int(
-            math.ceil(self.min_seconds / self.seconds_per_buffer)
-        )
-
-        self.speech_buffers_left = self.speech_buffers
-        self.skip_buffers_left = self.skip_buffers
-        self.in_phrase = False
-        self.after_phrase = False
-        self.silence_buffers = int(
-            math.ceil(self.silence_seconds / self.seconds_per_buffer)
-        )
-
-        self.current_seconds: float = 0
-
-        self.current_chunk: bytes = bytes()
-
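# --- Editor's worked example (not part of the upstream patch) -----------------
# The buffer arithmetic above: one 960-byte chunk of 16-bit (2-byte) mono
# audio holds 480 samples, i.e. 0.03 s (30 ms) at 16 kHz, so the default
# silence_seconds=0.5 maps to ceil(0.5 / 0.03) = 17 buffers.
import math

seconds_per_buffer = (960 / 2) / 16000  # chunk_size / sample_width / sample_rate
assert seconds_per_buffer == 0.03
assert math.ceil(0.5 / seconds_per_buffer) == 17
# ------------------------------------------------------------------------------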
-    def stop(self, phrase_only=False) -> bytes:
-        """Free any resources and return recorded audio."""
-        before_buffer = bytes()
-        for before_chunk in self.before_phrase_chunks:
-            before_buffer += before_chunk
-
-        if phrase_only:
-            # TODO is 5 a good magic number
-            # the aim is to include just a tiny bit of silence
-            # and avoid super long recordings to account
-            # for non streaming STT
-            before_buffer = before_chunk[-5:]
-
-        audio_data = before_buffer + self.phrase_buffer
-
-        # Clear state
-        self.before_phrase_chunks.clear()
-        self.phrase_buffer = bytes()
-        self.current_chunk = bytes()
-
-        # Return leftover audio
-        return audio_data
-
-    def process(self, audio_chunk: bytes) -> SilenceResult:
-        """Process a single chunk of audio data."""
-        result: typing.Optional[SilenceResult] = None
-        is_speech = False
-        energy = SilenceDetector.get_debiased_energy(audio_chunk)
-
-        # Add to overall buffer
-        self.current_chunk += audio_chunk
-
-        # Process audio in exact chunk(s)
-        while len(self.current_chunk) > self.chunk_size:
-            # Extract chunk
-            chunk = self.current_chunk[: self.chunk_size]
-            self.current_chunk = self.current_chunk[self.chunk_size :]
-
-            if self.skip_buffers_left > 0:
-                # Skip audio at beginning
-                self.skip_buffers_left -= 1
-                continue
-
-            if self.in_phrase:
-                self.phrase_buffer += chunk
-            else:
-                self.before_phrase_chunks.append(chunk)
-
-            self.current_seconds += self.seconds_per_buffer
-
-            # Check maximum number of seconds to record
-            if self.max_buffers:
-                self.max_buffers -= 1
-                if self.max_buffers <= 0:
-                    # Timeout
-                    return SilenceResult(type=SilenceResultType.TIMEOUT, energy=energy)
-
-            # Detect speech in chunk
-            is_speech = not self.is_silence(chunk, energy=energy)
-
-            # Handle state changes
-            if is_speech and self.speech_buffers_left > 0:
-                self.speech_buffers_left -= 1
-            elif is_speech and not self.in_phrase:
-                # Start of phrase
-                result = SilenceResult(
-                    type=SilenceResultType.PHRASE_START, energy=energy
-                )
-
-                self.in_phrase = True
-                self.after_phrase = False
-                self.min_phrase_buffers = int(
-                    math.ceil(self.min_seconds / self.seconds_per_buffer)
-                )
-            elif self.in_phrase and (self.min_phrase_buffers > 0):
-                # In phrase, before minimum seconds
-                self.min_phrase_buffers -= 1
-            elif not is_speech:
-                # Outside of speech
-                if not self.in_phrase:
-                    # Reset
-                    self.speech_buffers_left = self.speech_buffers
-                elif self.after_phrase and (self.silence_buffers > 0):
-                    # After phrase, before stop
-                    self.silence_buffers -= 1
-                elif self.after_phrase and (self.silence_buffers <= 0):
-                    # Phrase complete
-                    # Merge before/during command audio data
-                    before_buffer = bytes()
-                    for before_chunk in self.before_phrase_chunks:
-                        before_buffer += before_chunk
-
-                    return SilenceResult(
-                        type=SilenceResultType.PHRASE_END, energy=energy
-                    )
-                elif self.in_phrase and (self.min_phrase_buffers <= 0):
-                    # Transition to after phrase
-                    self.after_phrase = True
-                    self.silence_buffers = int(
-                        math.ceil(self.silence_seconds / self.seconds_per_buffer)
-                    )
-
-        if result is None:
-            # Report speech/silence
-            result = SilenceResult(
-                type=SilenceResultType.SPEECH
-                if is_speech
-                else SilenceResultType.SILENCE,
-                energy=energy,
-            )
-
-        return result
-
-    # -------------------------------------------------------------------------
-
-    def is_silence(self, chunk: bytes, energy: typing.Optional[float] = None) -> bool:
-        """True if audio chunk contains silence."""
-        all_silence = True
-
-        if self.use_vad:
-            assert self.vad is not None
-            all_silence = all_silence and self.vad.is_silence(chunk)
-
-        if self.use_ratio or self.use_current:
-            # Compute debiased energy of audio chunk
-            if energy is None:
-                energy = SilenceDetector.get_debiased_energy(chunk)
-
-            if self.use_ratio:
-                # Ratio of max/current energy compared to threshold
-                if self.dynamic_max_energy:
-                    # Overwrite max energy
-                    if self.max_energy is None:
-                        self.max_energy = energy
-                    else:
-                        self.max_energy = max(energy, self.max_energy)
-
-                assert self.max_energy is not None
-                if energy > 0:
-                    ratio = self.max_energy / energy
-                else:
-                    # Not sure what to do here
-                    ratio = 0
-
-                assert self.max_current_ratio_threshold is not None
-                all_silence = all_silence and (ratio > self.max_current_ratio_threshold)
-            elif self.use_current:
-                # Current energy compared to threshold
-                assert self.current_energy_threshold is not None
-                all_silence = all_silence and (energy < self.current_energy_threshold)
-
-        return all_silence
-
-    # -------------------------------------------------------------------------
-
-    @staticmethod
-    def get_debiased_energy(audio_data: bytes) -> float:
-        """Compute RMS of debiased audio."""
-        # Thanks to the speech_recognition library!
-        # https://github.com/Uberi/speech_recognition/blob/master/speech_recognition/__init__.py
-        energy = -audioop.rms(audio_data, 2)
-        energy_bytes = bytes([energy & 0xFF, (energy >> 8) & 0xFF])
-        debiased_energy = audioop.rms(
-            audioop.add(audio_data, energy_bytes * (len(audio_data) // 2), 2), 2
-        )
-
-        # Probably actually audio if > 30
-        return debiased_energy
+# backwards compat imports
+from ovos_listener.silence import SilenceMethod, SilenceResult, SilenceResultType, SilenceDetector
\ No newline at end of file
diff --git a/mycroft/listener/stt.py b/mycroft/listener/stt.py
index b64aba8fc6aa..e581a44349c9 100644
--- a/mycroft/listener/stt.py
+++ b/mycroft/listener/stt.py
@@ -1,13 +1,3 @@
-from ovos_config.config import Configuration
-from ovos_utils.log import LOG
-
-from ovos_plugin_manager.stt import OVOSSTTFactory, load_stt_plugin
-
-
-class STTFactory(OVOSSTTFactory):
-    @staticmethod
-    def create(config=None):
-        config = config or Configuration().get("stt", {})
-        module = config.get("module", "ovos-stt-plugin-selene")
-        LOG.info(f"Creating STT engine: {module}")
-        return OVOSSTTFactory.create(config)
+# backwards compat imports
+from ovos_listener.stt import STTFactory
+from ovos_plugin_manager.stt import load_stt_plugin
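# --- Editor's usage sketch (not part of the upstream patch) -------------------
# Old call sites keep working through the shim above; with no argument the
# factory falls back to the "stt" section of the runtime configuration. The
# module name below is illustrative and must name an installed STT plugin.
from mycroft.listener.stt import STTFactory

stt = STTFactory.create({"module": "ovos-stt-plugin-vosk"})
print(type(stt).__name__)
# ------------------------------------------------------------------------------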