diff --git a/openwakeword/model.py b/openwakeword/model.py index 6ae820c..6029963 100755 --- a/openwakeword/model.py +++ b/openwakeword/model.py @@ -224,10 +224,13 @@ def get_parent_model_from_label(self, label): return parent_model def reset(self): - """Reset the prediction buffer""" + """Reset the prediction and audio feature buffers. Useful for re-initializing the model, though may not be efficient + when called too frequently.""" self.prediction_buffer = defaultdict(partial(deque, maxlen=30)) + self.preprocessor.reset() - def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timing: bool = False): + def predict(self, x: np.ndarray, patience: dict = {}, + threshold: dict = {}, debounce_time: float = 0.0, timing: bool = False): """Predict with all of the wakeword models on the input audio frames Args: @@ -242,9 +245,11 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi model names and the values are the number of frames. Can reduce false-positive detections at the cost of a lower true-positive rate. By default, this behavior is disabled. - threshold (dict): The threshold values to use when the `patience` behavior is enabled. + threshold (dict): The threshold values to use when the `patience` or `debounce_time` behavior is enabled. Must be provided as an a dictionary where the keys are the model names and the values are the thresholds. + debounce_time (float): The time (in seconds) to wait before returning another non-zero prediction + after a non-zero prediction. Can prevent multiple detections of the same wake-word. timing (bool): Whether to return timing information of the models. Can be useful to debug and assess how efficiently models are running on the current hardware. 
@@ -322,27 +327,40 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi )[0][-1] predictions[cls] = verifier_prediction - # Update prediction buffer, and zero predictions for first 5 frames during model initialization + # Zero predictions for first 5 frames during model initialization for cls in predictions.keys(): if len(self.prediction_buffer[cls]) < 5: predictions[cls] = 0.0 - self.prediction_buffer[cls].append(predictions[cls]) # Get timing information if timing: timing_dict["models"][mdl] = time.time() - model_start # Update scores based on thresholds or patience arguments - if patience != {}: + if patience != {} or debounce_time > 0: if threshold == {}: raise ValueError("Error! When using the `patience` argument, threshold " "values must be provided via the `threshold` argument!") + if patience != {} and debounce_time > 0: + raise ValueError("Error! The `patience` and `debounce_time` arguments cannot be used together!") for mdl in predictions.keys(): parent_model = self.get_parent_model_from_label(mdl) - if parent_model in patience.keys(): - scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:] - if (scores >= threshold[parent_model]).sum() < patience[parent_model]: - predictions[mdl] = 0.0 + if predictions[mdl] != 0.0: + if parent_model in patience.keys(): + scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:] + if (scores >= threshold[parent_model]).sum() < patience[parent_model]: + predictions[mdl] = 0.0 + elif debounce_time > 0: + if parent_model in threshold.keys(): + n_frames = int(np.ceil(debounce_time/(n_prepared_samples/16000))) + recent_predictions = np.array(self.prediction_buffer[mdl])[-n_frames:] + if predictions[mdl] >= threshold[parent_model] and \ + (recent_predictions >= threshold[parent_model]).sum() > 0: + predictions[mdl] = 0.0 + + # Update prediction buffer + for mdl in predictions.keys(): + self.prediction_buffer[mdl].append(predictions[mdl]) # (optionally) get voice 
activity detection scores and update model scores if self.vad_threshold > 0: diff --git a/openwakeword/utils.py b/openwakeword/utils.py index 8da8048..4964706 100644 --- a/openwakeword/utils.py +++ b/openwakeword/utils.py @@ -160,7 +160,7 @@ def tflite_embedding_predict(x): self.embedding_model_predict = tflite_embedding_predict - # Create databuffers + # Create databuffers with empty/random data self.raw_data_buffer: Deque = deque(maxlen=sr*10) self.melspectrogram_buffer = np.ones((76, 32)) # n_frames x num_features self.melspectrogram_max_len = 10*97 # 97 is the number of frames in 1 second of 16hz audio @@ -169,6 +169,14 @@ def tflite_embedding_predict(x): self.feature_buffer = self._get_embeddings(np.random.randint(-1000, 1000, 16000*4).astype(np.int16)) self.feature_buffer_max_len = 120 # ~10 seconds of feature buffer history + def reset(self): + """Reset the internal buffers""" + self.raw_data_buffer.clear() + self.melspectrogram_buffer = np.ones((76, 32)) + self.accumulated_samples = 0 + self.raw_data_remainder = np.empty(0) + self.feature_buffer = self._get_embeddings(np.random.randint(-1000, 1000, 16000*4).astype(np.int16)) + def _get_melspectrogram(self, x: Union[np.ndarray, List], melspec_transform: Callable = lambda x: x/10 + 2): """ Function to compute the mel-spectrogram of the provided audio samples. 
diff --git a/tests/test_models.py b/tests/test_models.py index e728065..b3907ff 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -39,6 +39,7 @@ import pickle import tempfile import mock +import wave # Download models needed for tests openwakeword.utils.download_models() @@ -208,6 +209,48 @@ def test_models_with_speex_noise_cancellation(self): ) assert 1 == 1 + def test_models_with_debounce(self): + # Load model with defaults + owwModel = openwakeword.Model() + + # Predict with chunks of 1280 with and without debounce + predictions = owwModel.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), + debounce_time=0, threshold={"alexa_v0.1": 0.5}) + scores = np.array([i['alexa'] for i in predictions]) + + predictions = owwModel.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), + debounce_time=1.25, threshold={"alexa": 0.5}) + scores_with_debounce = np.array([i['alexa'] for i in predictions]) + print(scores, scores_with_debounce) + assert (scores >= 0.5).sum() > 1 + assert (scores_with_debounce >= 0.5).sum() == 1 + + def test_model_reset(self): + # Load the model + owwModel = openwakeword.Model() + + # Get test clip and load it + clip = os.path.join("tests", "data", "alexa_test.wav") + with wave.open(clip, mode='rb') as f: + data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16) + + # Predict frame by frame + for i in range(0, len(data), 1280): + prediction = owwModel.predict(data[i:i+1280]) + if prediction['alexa'] > 0.5: + break + + # Assert that next prediction is still > 0.5 + prediction = owwModel.predict(data[i:i+1280]) + assert prediction['alexa'] > 0.5 + + # Reset the model + owwModel.reset() + + # Assert that next prediction is < 0.5 + prediction = owwModel.predict(data[i:i+1280]) + assert prediction['alexa'] < 0.5 + def test_models_with_vad(self): # Load model with defaults owwModel = openwakeword.Model(vad_threshold=0.5)