From c63384489e48040f0a7f30a2e5d3ba4bb047c6c7 Mon Sep 17 00:00:00 2001 From: dscripka Date: Sun, 11 Feb 2024 12:04:41 -0500 Subject: [PATCH 1/4] Added basic debounce logic for model.predict --- openwakeword/model.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/openwakeword/model.py b/openwakeword/model.py index 6ae820c..97d1303 100755 --- a/openwakeword/model.py +++ b/openwakeword/model.py @@ -227,7 +227,8 @@ def reset(self): """Reset the prediction buffer""" self.prediction_buffer = defaultdict(partial(deque, maxlen=30)) - def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timing: bool = False): + def predict(self, x: np.ndarray, patience: dict = {}, + threshold: dict = {}, debounce_time: float = 0.0, timing: bool = False): """Predict with all of the wakeword models on the input audio frames Args: @@ -242,9 +243,11 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi model names and the values are the number of frames. Can reduce false-positive detections at the cost of a lower true-positive rate. By default, this behavior is disabled. - threshold (dict): The threshold values to use when the `patience` behavior is enabled. + threshold (dict): The threshold values to use when the `patience` or `debounce_time` behavior is enabled. Must be provided as an a dictionary where the keys are the model names and the values are the thresholds. + debounce_time (float): The time (in seconds) to wait before returning another non-zero prediction + after a non-zero prediction. Can prevent multiple detections of the same wake-word. timing (bool): Whether to return timing information of the models. Can be useful to debug and assess how efficiently models are running on the current hardware. 
@@ -333,16 +336,22 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi timing_dict["models"][mdl] = time.time() - model_start # Update scores based on thresholds or patience arguments - if patience != {}: + if patience != {} or debounce_time > 0: if threshold == {}: raise ValueError("Error! When using the `patience` argument, threshold " "values must be provided via the `threshold` argument!") + if patience != {} and debounce_time > 0: + raise ValueError("Error! The `patience` and `debounce_time` arguments cannot be used together!") for mdl in predictions.keys(): parent_model = self.get_parent_model_from_label(mdl) if parent_model in patience.keys(): scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:] if (scores >= threshold[parent_model]).sum() < patience[parent_model]: predictions[mdl] = 0.0 + if debounce_time > 0: + n_frames = int(debounce_time*1000/80) + if (np.array(self.prediction_buffer[mdl])[-n_frames:] >= threshold[parent_model]).sum() > 0: + predictions[mdl] = 0.0 # (optionally) get voice activity detection scores and update model scores if self.vad_threshold > 0: From 68e88c1350113a1e70afc8cb64e8d9d120db619f Mon Sep 17 00:00:00 2001 From: dscripka Date: Sun, 11 Feb 2024 12:08:03 -0500 Subject: [PATCH 2/4] Added/fixed reset methods --- openwakeword/model.py | 4 +++- openwakeword/utils.py | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/openwakeword/model.py b/openwakeword/model.py index 97d1303..8f2ef42 100755 --- a/openwakeword/model.py +++ b/openwakeword/model.py @@ -224,8 +224,10 @@ def get_parent_model_from_label(self, label): return parent_model def reset(self): - """Reset the prediction buffer""" + """Reset the prediction and audio feature buffers. 
Useful for re-initializing the model, though may not be efficient + when called too frequently.""" self.prediction_buffer = defaultdict(partial(deque, maxlen=30)) + self.preprocessor.reset() def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, debounce_time: float = 0.0, timing: bool = False): diff --git a/openwakeword/utils.py b/openwakeword/utils.py index 8da8048..4964706 100644 --- a/openwakeword/utils.py +++ b/openwakeword/utils.py @@ -160,7 +160,7 @@ def tflite_embedding_predict(x): self.embedding_model_predict = tflite_embedding_predict - # Create databuffers + # Create databuffers with empty/random data self.raw_data_buffer: Deque = deque(maxlen=sr*10) self.melspectrogram_buffer = np.ones((76, 32)) # n_frames x num_features self.melspectrogram_max_len = 10*97 # 97 is the number of frames in 1 second of 16hz audio @@ -169,6 +169,14 @@ def tflite_embedding_predict(x): self.feature_buffer = self._get_embeddings(np.random.randint(-1000, 1000, 16000*4).astype(np.int16)) self.feature_buffer_max_len = 120 # ~10 seconds of feature buffer history + def reset(self): + """Reset the internal buffers""" + self.raw_data_buffer.clear() + self.melspectrogram_buffer = np.ones((76, 32)) + self.accumulated_samples = 0 + self.raw_data_remainder = np.empty(0) + self.feature_buffer = self._get_embeddings(np.random.randint(-1000, 1000, 16000*4).astype(np.int16)) + def _get_melspectrogram(self, x: Union[np.ndarray, List], melspec_transform: Callable = lambda x: x/10 + 2): """ Function to compute the mel-spectrogram of the provided audio samples. 
From 528f4bff2cb78f3aae594c6da127a99f312ae5cf Mon Sep 17 00:00:00 2001 From: dscripka Date: Sun, 11 Feb 2024 12:45:59 -0500 Subject: [PATCH 3/4] tests for debounce functionality --- openwakeword/model.py | 27 +++++++++++++++++---------- tests/test_models.py | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/openwakeword/model.py b/openwakeword/model.py index 8f2ef42..6029963 100755 --- a/openwakeword/model.py +++ b/openwakeword/model.py @@ -327,11 +327,10 @@ def predict(self, x: np.ndarray, patience: dict = {}, )[0][-1] predictions[cls] = verifier_prediction - # Update prediction buffer, and zero predictions for first 5 frames during model initialization + # Zero predictions for first 5 frames during model initialization for cls in predictions.keys(): if len(self.prediction_buffer[cls]) < 5: predictions[cls] = 0.0 - self.prediction_buffer[cls].append(predictions[cls]) # Get timing information if timing: @@ -346,14 +345,22 @@ def predict(self, x: np.ndarray, patience: dict = {}, raise ValueError("Error! 
The `patience` and `debounce_time` arguments cannot be used together!") for mdl in predictions.keys(): parent_model = self.get_parent_model_from_label(mdl) - if parent_model in patience.keys(): - scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:] - if (scores >= threshold[parent_model]).sum() < patience[parent_model]: - predictions[mdl] = 0.0 - if debounce_time > 0: - n_frames = int(debounce_time*1000/80) - if (np.array(self.prediction_buffer[mdl])[-n_frames:] >= threshold[parent_model]).sum() > 0: - predictions[mdl] = 0.0 + if predictions[mdl] != 0.0: + if parent_model in patience.keys(): + scores = np.array(self.prediction_buffer[mdl])[-patience[parent_model]:] + if (scores >= threshold[parent_model]).sum() < patience[parent_model]: + predictions[mdl] = 0.0 + elif debounce_time > 0: + if parent_model in threshold.keys(): + n_frames = int(np.ceil(debounce_time/(n_prepared_samples/16000))) + recent_predictions = np.array(self.prediction_buffer[mdl])[-n_frames:] + if predictions[mdl] >= threshold[parent_model] and \ + (recent_predictions >= threshold[parent_model]).sum() > 0: + predictions[mdl] = 0.0 + + # Update prediction buffer + for mdl in predictions.keys(): + self.prediction_buffer[mdl].append(predictions[mdl]) # (optionally) get voice activity detection scores and update model scores if self.vad_threshold > 0: diff --git a/tests/test_models.py b/tests/test_models.py index e728065..fb6defd 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -208,6 +208,25 @@ def test_models_with_speex_noise_cancellation(self): ) assert 1 == 1 + def test_models_with_debounce(self): + # Load model with defaults + owwModel = openwakeword.Model() + + # Get test clip + os.path.join("tests", "data", "alexa_test.wav") + + # Predict with chunks of 1280 with and without debounce + predictions = owwModel.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), + debounce_time=0, threshold={"alexa_v0.1": 0.5}) + scores = np.array([i['alexa'] for i 
in predictions]) + + predictions = owwModel.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), + debounce_time=1.25, threshold={"alexa": 0.5}) + scores_with_debounce = np.array([i['alexa'] for i in predictions]) + print(scores, scores_with_debounce) + assert (scores >= 0.5).sum() > 1 + assert (scores_with_debounce >= 0.5).sum() == 1 + def test_models_with_vad(self): # Load model with defaults owwModel = openwakeword.Model(vad_threshold=0.5) From e9eade7aacb67154a1bde1b0e766f881eb73fcc6 Mon Sep 17 00:00:00 2001 From: dscripka Date: Sun, 11 Feb 2024 15:08:27 -0500 Subject: [PATCH 4/4] Added tests for reset methods --- tests/test_models.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index fb6defd..b3907ff 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -39,6 +39,7 @@ import pickle import tempfile import mock +import wave # Download models needed for tests openwakeword.utils.download_models() @@ -212,9 +213,6 @@ def test_models_with_debounce(self): # Load model with defaults owwModel = openwakeword.Model() - # Get test clip - os.path.join("tests", "data", "alexa_test.wav") - # Predict with chunks of 1280 with and without debounce predictions = owwModel.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), debounce_time=0, threshold={"alexa_v0.1": 0.5}) @@ -227,6 +225,32 @@ def test_models_with_debounce(self): assert (scores >= 0.5).sum() > 1 assert (scores_with_debounce >= 0.5).sum() == 1 + def test_model_reset(self): + # Load the model + owwModel = openwakeword.Model() + + # Get test clip and load it + clip = os.path.join("tests", "data", "alexa_test.wav") + with wave.open(clip, mode='rb') as f: + data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16) + + # Predict frame by frame + for i in range(0, len(data), 1280): + prediction = owwModel.predict(data[i:i+1280]) + if prediction['alexa'] > 0.5: + break + + # Assert that next 
prediction is still > 0.5 + prediction = owwModel.predict(data[i:i+1280]) + assert prediction['alexa'] > 0.5 + + # Reset the model + owwModel.reset() + + # Assert that next prediction is < 0.5 + prediction = owwModel.predict(data[i:i+1280]) + assert prediction['alexa'] < 0.5 + def test_models_with_vad(self): # Load model with defaults owwModel = openwakeword.Model(vad_threshold=0.5)