From 23b97679e5c35d1a4b468cfb449aa078ed6da02b Mon Sep 17 00:00:00 2001 From: Daniel McKnight Date: Tue, 17 Sep 2024 09:33:40 -0700 Subject: [PATCH] Add scalar confidence values as instance variables for skill customization Refactor confidence calculation to simplify logic, document process, and add logging Outline unit tests for confidence calculations --- ovos_workshop/skills/common_query_skill.py | 52 ++++++++++++++----- .../skills/test_common_query_skill.py | 11 +++- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/ovos_workshop/skills/common_query_skill.py b/ovos_workshop/skills/common_query_skill.py index fb77fc50..25fad4f9 100644 --- a/ovos_workshop/skills/common_query_skill.py +++ b/ovos_workshop/skills/common_query_skill.py @@ -34,6 +34,8 @@ class CQSMatchLevel(IntEnum): [e.name for e in CQSMatchLevel]) """these are for the confidence calculation""" +# TODO: TOPIC_MATCH_RELEVANCE and RELEVANCE_MULTIPLIER stack on the same count of +# "relevant" words. This adds too much artificial confidence (>100%) # how much each topic word is worth # when found in the answer TOPIC_MATCH_RELEVANCE = 5 @@ -60,12 +62,18 @@ class CommonQuerySkill(OVOSSkill): """ def __init__(self, *args, **kwargs): - # these should probably be configurable + # Confidence calculation numbers may be configured per-skill self.level_confidence = { CQSMatchLevel.EXACT: 0.9, CQSMatchLevel.CATEGORY: 0.6, CQSMatchLevel.GENERAL: 0.5 } + self.relevance_multiplier = TOPIC_MATCH_RELEVANCE * RELEVANCE_MULTIPLIER + self.input_consumed_multiplier = 0.1 + # TODO: The below defaults of 0.1 add ~25% for a 2-sentence response which is too much + self.response_sentences_multiplier = 0.1 + self.response_words_multiplier = 1 / WORD_COUNT_DIVISOR + super().__init__(*args, **kwargs) noise_words_filepath = f"text/{self.lang}/noise_words.list" @@ -201,36 +209,52 @@ def __calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel, consumed_pct = len(match.split()) / len(phrase.split()) if consumed_pct > 1.0: consumed_pct = 1.0 - consumed_pct /= 10 - # bonus for more sentences - num_sentences = float(float(len(answer.split("."))) / float(10)) + # Approximate the number of sentences in the answer. A trailing `.` will + # split, so reduce length by 1. If no `.` is present, ensure we count + # any response as at least 1 sentence + num_sentences = min(len(answer.split(".")) - 1, 1) - # extract topic + # Remove articles and question words to approximate the meaningful part + # of what the skill extracted from the user input topic = self.remove_noise(match) - # calculate relevance + # Determine how many relevant words from the input are present in the + # answer + # TODO: Strip SSML from the answer here answer = answer.lower() matches = 0 for word in topic.split(' '): if answer.find(word) > -1: - matches += TOPIC_MATCH_RELEVANCE - + matches += 1 + LOG.debug(f"Answer matched {matches} words") answer_size = len(answer.split(" ")) - answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size) + # Calculate relevance as the percentage of relevant input words divided + # by the length of the response. This means that an answer that only + # contains the input will have a relevance value of 1 relevance = 0.0 if answer_size > 0: relevance = float(float(matches) / float(answer_size)) - relevance = relevance * RELEVANCE_MULTIPLIER + # extra credit for more words up to a point. By default, 50 words is + # considered optimal + answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size) - # extra credit for more words up to a point - wc_mod = float(float(answer_size) / float(WORD_COUNT_DIVISOR)) * 2 + # Calculate bonuses based on calculated values and configured weights + consumed_pct_bonus = consumed_pct * self.input_consumed_multiplier + num_sentences_bonus = num_sentences * self.response_sentences_multiplier + relevance_bonus = relevance * self.relevance_multiplier + word_count_bonus = answer_size * self.response_words_multiplier + LOG.debug(f"consumed_pct_bonus={consumed_pct_bonus}|num_sentence_bonus=" + f"{num_sentences_bonus}|relevance_bonus={relevance_bonus}|" + f"word_count_bonus={word_count_bonus}") confidence = self.level_confidence[level] + \ - consumed_pct + num_sentences + relevance + wc_mod - + consumed_pct_bonus + num_sentences_bonus + relevance_bonus + word_count_bonus + if confidence > 1: + LOG.warning(f"Calculated confidence > 1.0: {confidence}") + return 1.0 return confidence def __handle_query_classic(self, message): diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py index ef182e86..e85cc959 100644 --- a/test/unittests/skills/test_common_query_skill.py +++ b/test/unittests/skills/test_common_query_skill.py @@ -44,8 +44,15 @@ def test_remove_noise(self): pass def test_calc_confidence(self): - # TODO - pass + generic_q = "what is coke" + specific_q = "how much caffeine is in coca cola" + specific_q_2 = "what is the stock price for coca cola" + + conf = self.skill._CommonQuerySkill__calc_confidence("diet coke", "what is diet coke", CQSMatchLevel.GENERAL, + "The drink diet coke has 32 milligrams of caffeine in 250 milliliters. Provided by CaffeineWiz.") + self.assertLessEqual(conf, 1.0) + print(conf) + def test_handle_query_action(self): # TODO