From 23b97679e5c35d1a4b468cfb449aa078ed6da02b Mon Sep 17 00:00:00 2001
From: Daniel McKnight <daniel@neon.ai>
Date: Tue, 17 Sep 2024 09:33:40 -0700
Subject: [PATCH] Add scalar confidence values as instance variables for skill
 customization; refactor confidence calculation to simplify logic, document
 process, and add logging; outline unit tests for confidence calculations

---
 ovos_workshop/skills/common_query_skill.py    | 52 ++++++++++++++-----
 .../skills/test_common_query_skill.py         | 11 +++-
 2 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/ovos_workshop/skills/common_query_skill.py b/ovos_workshop/skills/common_query_skill.py
index fb77fc50..25fad4f9 100644
--- a/ovos_workshop/skills/common_query_skill.py
+++ b/ovos_workshop/skills/common_query_skill.py
@@ -34,6 +34,8 @@ class CQSMatchLevel(IntEnum):
                               [e.name for e in CQSMatchLevel])
 
 """these are for the confidence calculation"""
+# TODO: TOPIC_MATCH_RELEVANCE and RELEVANCE_MULTIPLIER stack on the same count of
+#   "relevant" words. This adds too much artificial confidence (>100%)
 # how much each topic word is worth
 # when found in the answer
 TOPIC_MATCH_RELEVANCE = 5
@@ -60,12 +62,18 @@ class CommonQuerySkill(OVOSSkill):
     """
 
     def __init__(self, *args, **kwargs):
-        # these should probably be configurable
+        # Confidence calculation numbers may be configured per-skill
         self.level_confidence = {
             CQSMatchLevel.EXACT: 0.9,
             CQSMatchLevel.CATEGORY: 0.6,
             CQSMatchLevel.GENERAL: 0.5
         }
+        self.relevance_multiplier = TOPIC_MATCH_RELEVANCE * RELEVANCE_MULTIPLIER
+        self.input_consumed_multiplier = 0.1
+        # TODO: The below defaults of 0.1 add ~25% for a 2-sentence response which is too much
+        self.response_sentences_multiplier = 0.1
+        self.response_words_multiplier = 1 / WORD_COUNT_DIVISOR
+
         super().__init__(*args, **kwargs)
 
         noise_words_filepath = f"text/{self.lang}/noise_words.list"
@@ -201,36 +209,52 @@ def __calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel,
         consumed_pct = len(match.split()) / len(phrase.split())
         if consumed_pct > 1.0:
             consumed_pct = 1.0
-        consumed_pct /= 10
 
-        # bonus for more sentences
-        num_sentences = float(float(len(answer.split("."))) / float(10))
+        # Approximate the number of sentences in the answer. A trailing `.`
+        # yields an extra empty split element, so subtract 1. If no `.` is
+        # present, `max` ensures any response counts as at least 1 sentence
+        num_sentences = max(len(answer.split(".")) - 1, 1)
 
-        # extract topic
+        # Remove articles and question words to approximate the meaningful part
+        # of what the skill extracted from the user input
         topic = self.remove_noise(match)
 
-        # calculate relevance
+        # Determine how many relevant words from the input are present in the
+        # answer
+        # TODO: Strip SSML from the answer here
         answer = answer.lower()
         matches = 0
         for word in topic.split(' '):
             if answer.find(word) > -1:
-                matches += TOPIC_MATCH_RELEVANCE
-
+                matches += 1
+        LOG.debug(f"Answer matched {matches} words")
         answer_size = len(answer.split(" "))
-        answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size)
 
+        # Calculate relevance as the number of relevant input words found in the
+        # answer divided by the answer's word count. This means that an answer
+        # that only contains the input will have a relevance value of 1
         relevance = 0.0
         if answer_size > 0:
             relevance = float(float(matches) / float(answer_size))
 
-        relevance = relevance * RELEVANCE_MULTIPLIER
+        # extra credit for more words up to a point. By default, 50 words is
+        # considered optimal
+        answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size)
 
-        # extra credit for more words up to a point
-        wc_mod = float(float(answer_size) / float(WORD_COUNT_DIVISOR)) * 2
+        # Calculate bonuses based on calculated values and configured weights
+        consumed_pct_bonus = consumed_pct * self.input_consumed_multiplier
+        num_sentences_bonus = num_sentences * self.response_sentences_multiplier
+        relevance_bonus = relevance * self.relevance_multiplier
+        word_count_bonus = answer_size * self.response_words_multiplier
 
+        LOG.debug(f"consumed_pct_bonus={consumed_pct_bonus}|num_sentence_bonus="
+                  f"{num_sentences_bonus}|relevance_bonus={relevance_bonus}|"
+                  f"word_count_bonus={word_count_bonus}")
         confidence = self.level_confidence[level] + \
-                     consumed_pct + num_sentences + relevance + wc_mod
-
+                     consumed_pct_bonus + num_sentences_bonus + relevance_bonus + word_count_bonus
+        if confidence > 1:
+            LOG.warning(f"Calculated confidence > 1.0: {confidence}")
+            return 1.0
         return confidence
 
     def __handle_query_classic(self, message):
diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py
index ef182e86..e85cc959 100644
--- a/test/unittests/skills/test_common_query_skill.py
+++ b/test/unittests/skills/test_common_query_skill.py
@@ -44,8 +44,15 @@ def test_remove_noise(self):
         pass
 
     def test_calc_confidence(self):
-        # TODO
-        pass
+        generic_q = "what is coke"
+        specific_q = "how much caffeine is in coca cola"
+        specific_q_2 = "what is the stock price for coca cola"
+
+        conf = self.skill._CommonQuerySkill__calc_confidence("diet coke", "what is diet coke", CQSMatchLevel.GENERAL,
+                                            "The drink diet coke has 32 milligrams of caffeine in 250 milliliters.</speak> Provided by CaffeineWiz.")
+        self.assertLessEqual(conf, 1.0)
+        print(conf)
+
 
     def test_handle_query_action(self):
         # TODO