From 23b97679e5c35d1a4b468cfb449aa078ed6da02b Mon Sep 17 00:00:00 2001 From: Daniel McKnight Date: Tue, 17 Sep 2024 09:33:40 -0700 Subject: [PATCH 1/6] Add scalar confidence values as instance variables for skill customization Refactor confidence calculation to simplify logic, document process, and add logging Outline unit tests for confidence calculations --- ovos_workshop/skills/common_query_skill.py | 52 ++++++++++++++----- .../skills/test_common_query_skill.py | 11 +++- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/ovos_workshop/skills/common_query_skill.py b/ovos_workshop/skills/common_query_skill.py index fb77fc50..25fad4f9 100644 --- a/ovos_workshop/skills/common_query_skill.py +++ b/ovos_workshop/skills/common_query_skill.py @@ -34,6 +34,8 @@ class CQSMatchLevel(IntEnum): [e.name for e in CQSMatchLevel]) """these are for the confidence calculation""" +# TODO: TOPIC_MATCH_RELEVANCE and RELEVANCE_MULTIPLIER stack on the same count of +# "relevant" words. This adds too much artificial confidence (>100%) # how much each topic word is worth # when found in the answer TOPIC_MATCH_RELEVANCE = 5 @@ -60,12 +62,18 @@ class CommonQuerySkill(OVOSSkill): """ def __init__(self, *args, **kwargs): - # these should probably be configurable + # Confidence calculation numbers may be configured per-skill self.level_confidence = { CQSMatchLevel.EXACT: 0.9, CQSMatchLevel.CATEGORY: 0.6, CQSMatchLevel.GENERAL: 0.5 } + self.relevance_multiplier = TOPIC_MATCH_RELEVANCE * RELEVANCE_MULTIPLIER + self.input_consumed_multiplier = 0.1 + # TODO: The below defaults of 0.1 add ~25% for a 2-sentence response which is too much + self.response_sentences_multiplier = 0.1 + self.response_words_multiplier = 1 / WORD_COUNT_DIVISOR + super().__init__(*args, **kwargs) noise_words_filepath = f"text/{self.lang}/noise_words.list" @@ -201,36 +209,52 @@ def __calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel, consumed_pct = len(match.split()) / len(phrase.split()) if consumed_pct > 1.0: consumed_pct = 1.0 - consumed_pct /= 10 - # bonus for more sentences - num_sentences = float(float(len(answer.split("."))) / float(10)) + # Approximate the number of sentences in the answer. A trailing `.` will + # split, so reduce length by 1. If no `.` is present, ensure we count + # any response as at least 1 sentence + num_sentences = min(len(answer.split(".")) - 1, 1) - # extract topic + # Remove articles and question words to approximate the meaningful part + # of what the skill extracted from the user input topic = self.remove_noise(match) - # calculate relevance + # Determine how many relevant words from the input are present in the + # answer + # TODO: Strip SSML from the answer here answer = answer.lower() matches = 0 for word in topic.split(' '): if answer.find(word) > -1: - matches += TOPIC_MATCH_RELEVANCE - + matches += 1 + LOG.debug(f"Answer matched {matches} words") answer_size = len(answer.split(" ")) - answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size) + # Calculate relevance as the percentage of relevant input words divided + # by the length of the response. This means that an answer that only + # contains the input will have a relevance value of 1 relevance = 0.0 if answer_size > 0: relevance = float(float(matches) / float(answer_size)) - relevance = relevance * RELEVANCE_MULTIPLIER + # extra credit for more words up to a point. By default, 50 words is + # considered optimal + answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size) - # extra credit for more words up to a point - wc_mod = float(float(answer_size) / float(WORD_COUNT_DIVISOR)) * 2 + # Calculate bonuses based on calculated values and configured weights + consumed_pct_bonus = consumed_pct * self.input_consumed_multiplier + num_sentences_bonus = num_sentences * self.response_sentences_multiplier + relevance_bonus = relevance * self.relevance_multiplier + word_count_bonus = answer_size * self.response_words_multiplier + LOG.debug(f"consumed_pct_bonus={consumed_pct_bonus}|num_sentence_bonus=" + f"{num_sentences_bonus}|relevance_bonus={relevance_bonus}|" + f"word_count_bonus={word_count_bonus}") confidence = self.level_confidence[level] + \ - consumed_pct + num_sentences + relevance + wc_mod - + consumed_pct_bonus + num_sentences_bonus + relevance_bonus + word_count_bonus + if confidence > 1: + LOG.warning(f"Calculated confidence > 1.0: {confidence}") + return 1.0 return confidence def __handle_query_classic(self, message): diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py index ef182e86..e85cc959 100644 --- a/test/unittests/skills/test_common_query_skill.py +++ b/test/unittests/skills/test_common_query_skill.py @@ -44,8 +44,15 @@ def test_remove_noise(self): pass def test_calc_confidence(self): - # TODO - pass + generic_q = "what is coke" + specific_q = "how much caffeine is in coca cola" + specific_q_2 = "what is the stock price for coca cola" + + conf = self.skill._CommonQuerySkill__calc_confidence("diet coke", "what is diet coke", CQSMatchLevel.GENERAL, + "The drink diet coke has 32 milligrams of caffeine in 250 milliliters. Provided by CaffeineWiz.") + self.assertLessEqual(conf, 1.0) + print(conf) + def test_handle_query_action(self): # TODO From 08f429e3c29113ec938a7af2d2eb2b833ecb66ed Mon Sep 17 00:00:00 2001 From: Daniel McKnight Date: Tue, 17 Sep 2024 09:54:24 -0700 Subject: [PATCH 2/6] Make `calc_confidence` a normal method for skills to override --- ovos_workshop/skills/common_query_skill.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ovos_workshop/skills/common_query_skill.py b/ovos_workshop/skills/common_query_skill.py index 25fad4f9..b481335c 100644 --- a/ovos_workshop/skills/common_query_skill.py +++ b/ovos_workshop/skills/common_query_skill.py @@ -150,7 +150,10 @@ def __handle_question_query(self, message: Message): level = result[1] answer = result[2] callback = result[3] if len(result) > 3 else {} - confidence = self.__calc_confidence(match, search_phrase, level, answer) + confidence = self.calc_confidence(match, search_phrase, level, answer) + if confidence > 1.0: + LOG.warning(f"Calculated confidence {confidence} > 1.0") + confidence = 1.0 callback["answer"] = answer # ensure we get it back in CQS_action self.bus.emit(message.response({"phrase": search_phrase, "skill_id": self.skill_id, @@ -195,10 +198,11 @@ def remove_noise(self, phrase: str, lang: str = None) -> str: phrase = ' '.join(phrase.split()) return phrase.strip() - def __calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel, - answer: str) -> float: + def calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel, + answer: str) -> float: """ - Calculate a confidence level for the skill response. + Calculate a confidence level for the skill response. Skills may override + this method to implement custom confidence calculation @param match: Matched portion of the input phrase @param phrase: User input phrase that was evaluated @param level: Skill-determined match level of the answer From fab9c4267c5ebef4b6652a7b9fa419196b476d85 Mon Sep 17 00:00:00 2001 From: Daniel McKnight Date: Tue, 17 Sep 2024 10:15:48 -0700 Subject: [PATCH 3/6] Add unit test coverage for confidence calculations --- ovos_workshop/skills/common_query_skill.py | 2 +- .../skills/test_common_query_skill.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/ovos_workshop/skills/common_query_skill.py b/ovos_workshop/skills/common_query_skill.py index b481335c..024f2617 100644 --- a/ovos_workshop/skills/common_query_skill.py +++ b/ovos_workshop/skills/common_query_skill.py @@ -26,7 +26,7 @@ class CQSMatchLevel(IntEnum): EXACT = 1 # Skill could find a specific answer for the question CATEGORY = 2 # Skill could find an answer from a category in the query - GENERAL = 3 # The query could be processed as a general quer + GENERAL = 3 # The query could be processed as a general query # Copy of CQSMatchLevel to use if the skill returns visual media diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py index e85cc959..31d948b2 100644 --- a/test/unittests/skills/test_common_query_skill.py +++ b/test/unittests/skills/test_common_query_skill.py @@ -44,15 +44,23 @@ def test_remove_noise(self): pass def test_calc_confidence(self): - generic_q = "what is coke" + generic_q = "what is coca cola" specific_q = "how much caffeine is in coca cola" specific_q_2 = "what is the stock price for coca cola" + cw_answer = ("The drink diet coke has 32 milligrams of caffeine in " + "250 milliliters. Provided by CaffeineWiz.") - conf = self.skill._CommonQuerySkill__calc_confidence("diet coke", "what is diet coke", CQSMatchLevel.GENERAL, - "The drink diet coke has 32 milligrams of caffeine in 250 milliliters. Provided by CaffeineWiz.") - self.assertLessEqual(conf, 1.0) - print(conf) + generic_conf = self.skill.calc_confidence("coca cola", generic_q, + CQSMatchLevel.GENERAL, + cw_answer) + exact_conf = self.skill.calc_confidence("coca cola", specific_q, + CQSMatchLevel.EXACT, cw_answer) + low_conf = self.skill.calc_confidence("coca cola", specific_q_2, + CQSMatchLevel.GENERAL, cw_answer) + self.assertEqual(exact_conf, 1.0) + self.assertLess(generic_conf, exact_conf) + self.assertLess(low_conf, generic_conf) def test_handle_query_action(self): # TODO From 4bdf55fcb89348c6f5f4e811bd2d3b3dd029174d Mon Sep 17 00:00:00 2001 From: Daniel McKnight Date: Tue, 17 Sep 2024 10:50:03 -0700 Subject: [PATCH 4/6] Add unit test coverage for `remove_noise` to address codecov automation --- test/unittests/skills/test_common_query_skill.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py index 31d948b2..3f5c5f1c 100644 --- a/test/unittests/skills/test_common_query_skill.py +++ b/test/unittests/skills/test_common_query_skill.py @@ -40,8 +40,9 @@ def test_get_cq(self): pass def test_remove_noise(self): - # TODO - pass + noisy_match = "what is a computer" + normalized = "computer" + self.assertEqual(self.skill.remove_noise(noisy_match), normalized) def test_calc_confidence(self): generic_q = "what is coca cola" From 7d303403724ef1a041f7210a5bbfb67769c1f192 Mon Sep 17 00:00:00 2001 From: Daniel McKnight Date: Tue, 17 Sep 2024 11:10:50 -0700 Subject: [PATCH 5/6] Add unit test coverage for `__get_cq` to address codecov automation --- test/unittests/skills/test_common_query_skill.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py index 3f5c5f1c..d5ab7170 100644 --- a/test/unittests/skills/test_common_query_skill.py +++ b/test/unittests/skills/test_common_query_skill.py @@ -1,4 +1,5 @@ from unittest import TestCase +from unittest.mock import Mock from ovos_utils.messagebus import FakeBus from ovos_workshop.skills.base import BaseSkill @@ -36,8 +37,16 @@ def test_handle_question_query(self): pass def test_get_cq(self): - # TODO - pass + test_phrase = "test" + mock_return = Mock() + self.skill.CQS_match_query_phrase = Mock(return_value=mock_return) + result = self.skill._CommonQuerySkill__get_cq(test_phrase) + self.skill.CQS_match_query_phrase .assert_called_once_with(test_phrase) + self.assertEqual(result, mock_return) + + self.skill.CQS_match_query_phrase.side_effect = Exception() + result = self.skill._CommonQuerySkill__get_cq(test_phrase) + self.assertIsNone(result) def test_remove_noise(self): noisy_match = "what is a computer" From 588c727eec2b6ce11bd0c981fc8b7927e31c1f9b Mon Sep 17 00:00:00 2001 From: Daniel McKnight Date: Tue, 17 Sep 2024 15:01:20 -0700 Subject: [PATCH 6/6] Revert `calc_confidence` public method change Support returned float confidence level in addition to enum --- ovos_workshop/skills/common_query_skill.py | 23 +++++++++++-------- .../skills/test_common_query_skill.py | 15 ++++++------ 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/ovos_workshop/skills/common_query_skill.py b/ovos_workshop/skills/common_query_skill.py index 024f2617..80f2e3ed 100644 --- a/ovos_workshop/skills/common_query_skill.py +++ b/ovos_workshop/skills/common_query_skill.py @@ -13,7 +13,7 @@ from abc import abstractmethod from enum import IntEnum from os.path import dirname -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Union from ovos_bus_client import Message from ovos_utils.file_utils import resolve_resource_file @@ -150,7 +150,13 @@ def __handle_question_query(self, message: Message): level = result[1] answer = result[2] callback = result[3] if len(result) > 3 else {} - confidence = self.calc_confidence(match, search_phrase, level, answer) + if isinstance(level, float): + LOG.debug(f"Confidence directly reported by skill") + confidence = level + else: + LOG.info(f"Calculating confidence for level {level}") + confidence = self.__calc_confidence(match, search_phrase, level, + answer) if confidence > 1.0: LOG.warning(f"Calculated confidence {confidence} > 1.0") confidence = 1.0 @@ -167,8 +173,8 @@ def __handle_question_query(self, message: Message): "skill_id": self.skill_id, "searching": False})) - def __get_cq(self, search_phrase: str) -> (str, CQSMatchLevel, str, - Optional[dict]): + def __get_cq(self, search_phrase: str) -> (str, Union[CQSMatchLevel, float], + str, Optional[dict]): """ Invoke the CQS handler to let the skill perform its search @param search_phrase: parsed question to get an answer for @@ -198,11 +204,10 @@ def remove_noise(self, phrase: str, lang: str = None) -> str: phrase = ' '.join(phrase.split()) return phrase.strip() - def calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel, - answer: str) -> float: + def __calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel, + answer: str) -> float: """ - Calculate a confidence level for the skill response. Skills may override - this method to implement custom confidence calculation + Calculate a confidence level for the skill response. @param match: Matched portion of the input phrase @param phrase: User input phrase that was evaluated @param level: Skill-determined match level of the answer @@ -298,7 +303,7 @@ def __handle_query_action(self, message: Message): @abstractmethod def CQS_match_query_phrase(self, phrase: str) -> \ - Optional[Tuple[str, CQSMatchLevel, Optional[dict]]]: + Optional[Tuple[str, Union[CQSMatchLevel, float], Optional[dict]]]: """ Determine an answer to the input phrase and return match information, or `None` if no answer can be determined. diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py index d5ab7170..a9eff418 100644 --- a/test/unittests/skills/test_common_query_skill.py +++ b/test/unittests/skills/test_common_query_skill.py @@ -41,7 +41,7 @@ def test_get_cq(self): mock_return = Mock() self.skill.CQS_match_query_phrase = Mock(return_value=mock_return) result = self.skill._CommonQuerySkill__get_cq(test_phrase) - self.skill.CQS_match_query_phrase .assert_called_once_with(test_phrase) + self.skill.CQS_match_query_phrase.assert_called_once_with(test_phrase) self.assertEqual(result, mock_return) self.skill.CQS_match_query_phrase.side_effect = Exception() @@ -60,13 +60,12 @@ def test_calc_confidence(self): cw_answer = ("The drink diet coke has 32 milligrams of caffeine in " "250 milliliters. Provided by CaffeineWiz.") - generic_conf = self.skill.calc_confidence("coca cola", generic_q, - CQSMatchLevel.GENERAL, - cw_answer) - exact_conf = self.skill.calc_confidence("coca cola", specific_q, - CQSMatchLevel.EXACT, cw_answer) - low_conf = self.skill.calc_confidence("coca cola", specific_q_2, - CQSMatchLevel.GENERAL, cw_answer) + generic_conf = self.skill._CommonQuerySkill__calc_confidence( + "coca cola", generic_q, CQSMatchLevel.GENERAL, cw_answer) + exact_conf = self.skill._CommonQuerySkill__calc_confidence( + "coca cola", specific_q, CQSMatchLevel.EXACT, cw_answer) + low_conf = self.skill._CommonQuerySkill__calc_confidence( + "coca cola", specific_q_2, CQSMatchLevel.GENERAL, cw_answer) self.assertEqual(exact_conf, 1.0) self.assertLess(generic_conf, exact_conf)