Configurable confidence calculations with unit tests #234

Open · wants to merge 6 commits into base: dev

Changes from 5 commits
66 changes: 47 additions & 19 deletions ovos_workshop/skills/common_query_skill.py
@@ -26,14 +26,16 @@
 class CQSMatchLevel(IntEnum):
     EXACT = 1  # Skill could find a specific answer for the question
     CATEGORY = 2  # Skill could find an answer from a category in the query
-    GENERAL = 3  # The query could be processed as a general quer
+    GENERAL = 3  # The query could be processed as a general query


 # Copy of CQSMatchLevel to use if the skill returns visual media
 CQSVisualMatchLevel = IntEnum('CQSVisualMatchLevel',
                               [e.name for e in CQSMatchLevel])

-"""these are for the confidence calculation"""
+# TODO: TOPIC_MATCH_RELEVANCE and RELEVANCE_MULTIPLIER stack on the same count
+# of "relevant" words. This adds too much artificial confidence (>100%)
 # how much each topic word is worth
 # when found in the answer
 TOPIC_MATCH_RELEVANCE = 5
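
The stacking problem flagged in the TODO is easiest to see with numbers. Below is a small worked sketch of the pre-PR arithmetic; RELEVANCE_MULTIPLIER's real value is not visible in this hunk, so the 2 used here is an assumption:

# Worked illustration of the TODO above (not part of the diff).
# RELEVANCE_MULTIPLIER = 2 is an assumed placeholder, not the real constant.
TOPIC_MATCH_RELEVANCE = 5
RELEVANCE_MULTIPLIER = 2

matched_words = 4   # topic words from the query found in the answer
answer_words = 20   # words in the spoken answer

# Each matched word is first weighted by TOPIC_MATCH_RELEVANCE, then the
# resulting ratio is scaled again by RELEVANCE_MULTIPLIER, rewarding the
# same match count twice.
matches = matched_words * TOPIC_MATCH_RELEVANCE  # 20
relevance = matches / answer_words               # 1.0
relevance = relevance * RELEVANCE_MULTIPLIER     # 2.0, i.e. >100% on its own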
@@ -60,12 +62,18 @@ class CommonQuerySkill(OVOSSkill):
     """

     def __init__(self, *args, **kwargs):
-        # these should probably be configurable
+        # Confidence calculation numbers may be configured per-skill
         self.level_confidence = {
             CQSMatchLevel.EXACT: 0.9,
             CQSMatchLevel.CATEGORY: 0.6,
             CQSMatchLevel.GENERAL: 0.5
         }
+        self.relevance_multiplier = TOPIC_MATCH_RELEVANCE * RELEVANCE_MULTIPLIER
+        self.input_consumed_multiplier = 0.1
+        # TODO: The below defaults of 0.1 add ~25% for a 2-sentence response,
+        # which is too much
+        self.response_sentences_multiplier = 0.1
+        self.response_words_multiplier = 1 / WORD_COUNT_DIVISOR

         super().__init__(*args, **kwargs)

         noise_words_filepath = f"text/{self.lang}/noise_words.list"
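
Because the weights now live on the instance, a subclass can tune them after calling the parent constructor. A minimal sketch, assuming a hypothetical skill that prefers short, exact answers:

from ovos_workshop.skills.common_query_skill import (CommonQuerySkill,
                                                     CQSMatchLevel)


class ShortAnswerSkill(CommonQuerySkill):
    """Hypothetical skill that trusts exact matches and ignores length."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.level_confidence[CQSMatchLevel.EXACT] = 0.95
        # Stop rewarding longer answers entirely
        self.response_sentences_multiplier = 0.0
        self.response_words_multiplier = 0.0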
@@ -142,7 +150,10 @@ def __handle_question_query(self, message: Message):
             level = result[1]
             answer = result[2]
             callback = result[3] if len(result) > 3 else {}
-            confidence = self.__calc_confidence(match, search_phrase, level, answer)
+            confidence = self.calc_confidence(match, search_phrase, level, answer)
+            if confidence > 1.0:
+                LOG.warning(f"Calculated confidence {confidence} > 1.0")
+                confidence = 1.0
             callback["answer"] = answer  # ensure we get it back in CQS_action
             self.bus.emit(message.response({"phrase": search_phrase,
                                             "skill_id": self.skill_id,
Expand Down Expand Up @@ -187,10 +198,11 @@ def remove_noise(self, phrase: str, lang: str = None) -> str:
phrase = ' '.join(phrase.split())
return phrase.strip()

def __calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel,
answer: str) -> float:
def calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel,
+                        answer: str) -> float:
         """
-        Calculate a confidence level for the skill response.
+        Calculate a confidence level for the skill response. Skills may
+        override this method to implement custom confidence calculation
         @param match: Matched portion of the input phrase
         @param phrase: User input phrase that was evaluated
         @param level: Skill-determined match level of the answer
@@ -201,36 +213,52 @@ def __calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel,
         consumed_pct = len(match.split()) / len(phrase.split())
         if consumed_pct > 1.0:
             consumed_pct = 1.0
-        consumed_pct /= 10

-        # bonus for more sentences
-        num_sentences = float(float(len(answer.split("."))) / float(10))
+        # Approximate the number of sentences in the answer. A trailing `.`
+        # will split, so reduce length by 1. If no `.` is present, ensure we
+        # count any response as at least 1 sentence
+        num_sentences = max(len(answer.split(".")) - 1, 1)
JarbasAl (Member) commented on Sep 20, 2024:

this is not very good, should use quebra_frases instead (already a dependency)

print(quebra_frases.sentence_tokenize(
    "Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it. Did he mind? Adam Jones Jr. thinks he didn't. In any case, this isn't true... Well, with a probability of .9 it isn't."))
#['Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid a lot for it.',
#'Did he mind?',
#"Adam Jones Jr. thinks he didn't.",
#"In any case, this isn't true...",
#"Well, with a probability of .9 it isn't."]
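
If that suggestion is adopted, the sentence bonus could be computed from the tokenizer output instead of the `.`-split heuristic. A rough sketch of the substitution (not part of this PR):

import quebra_frases

def count_sentences(answer: str) -> int:
    # Treat any non-empty answer as at least one sentence
    if not answer.strip():
        return 0
    return max(len(quebra_frases.sentence_tokenize(answer)), 1)

# in calc_confidence:
# num_sentences = count_sentences(answer)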


-        # extract topic
+        # Remove articles and question words to approximate the meaningful
+        # part of what the skill extracted from the user input
         topic = self.remove_noise(match)

-        # calculate relevance
+        # Determine how many relevant words from the input are present in
+        # the answer
+        # TODO: Strip SSML from the answer here
         answer = answer.lower()
         matches = 0
         for word in topic.split(' '):
             if answer.find(word) > -1:
-                matches += TOPIC_MATCH_RELEVANCE
-
+                matches += 1
+        LOG.debug(f"Answer matched {matches} words")
         answer_size = len(answer.split(" "))
-        answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size)

+        # Calculate relevance as the percentage of relevant input words
+        # divided by the length of the response. This means that an answer
+        # that only contains the input will have a relevance value of 1
         relevance = 0.0
         if answer_size > 0:
             relevance = float(float(matches) / float(answer_size))

-        relevance = relevance * RELEVANCE_MULTIPLIER
+        # extra credit for more words up to a point. By default, 50 words is
+        # considered optimal
+        answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size)

-        # extra credit for more words up to a point
-        wc_mod = float(float(answer_size) / float(WORD_COUNT_DIVISOR)) * 2
+        # Calculate bonuses based on calculated values and configured weights
+        consumed_pct_bonus = consumed_pct * self.input_consumed_multiplier
+        num_sentences_bonus = num_sentences * self.response_sentences_multiplier
A project member commented:

should this be part of the score at all? It's a voice assistant; do we prefer a skill reading a full Wikipedia page vs giving a straight answer?

+        relevance_bonus = relevance * self.relevance_multiplier
+        word_count_bonus = answer_size * self.response_words_multiplier

+        LOG.debug(f"consumed_pct_bonus={consumed_pct_bonus}|num_sentences_bonus="
+                  f"{num_sentences_bonus}|relevance_bonus={relevance_bonus}|"
+                  f"word_count_bonus={word_count_bonus}")
         confidence = self.level_confidence[level] + \
-            consumed_pct + num_sentences + relevance + wc_mod
-
+            consumed_pct_bonus + num_sentences_bonus + relevance_bonus + word_count_bonus
         if confidence > 1:
+            LOG.warning(f"Calculated confidence > 1.0: {confidence}")
             return 1.0
         return confidence

     def __handle_query_classic(self, message):
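
To make the rewritten formula concrete, here is a hand trace with toy numbers. RELEVANCE_MULTIPLIER and WORD_COUNT_DIVISOR are not shown in this diff, so the values 2 and 100 below are assumptions:

# Hand trace of calc_confidence() with assumed constants:
# TOPIC_MATCH_RELEVANCE = 5, RELEVANCE_MULTIPLIER = 2 (assumed),
# WORD_COUNT_DIVISOR = 100 (assumed)
level_conf = 0.9          # CQSMatchLevel.EXACT default
consumed_pct = 2 / 6      # 2-word match out of a 6-word phrase
num_sentences = 2         # two '.'-terminated sentences in the answer
relevance = 4 / 20        # 4 relevant words in a 20-word answer
answer_size = 20          # already under the 50-word cap

confidence = (level_conf
              + consumed_pct * 0.1     # input_consumed_multiplier
              + num_sentences * 0.1    # response_sentences_multiplier
              + relevance * (5 * 2)    # relevance_multiplier
              + answer_size / 100)     # response_words_multiplier
print(round(confidence, 3))  # 3.333, which calc_confidence clamps to 1.0

Under these assumptions the relevance term alone contributes 2.0, which is why both the TODOs above and the clamp exist.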
37 changes: 31 additions & 6 deletions test/unittests/skills/test_common_query_skill.py
@@ -1,4 +1,5 @@
 from unittest import TestCase
+from unittest.mock import Mock

 from ovos_utils.messagebus import FakeBus
 from ovos_workshop.skills.base import BaseSkill
@@ -36,16 +37,40 @@ def test_handle_question_query(self):
         pass

     def test_get_cq(self):
-        # TODO
-        pass
+        test_phrase = "test"
+        mock_return = Mock()
+        self.skill.CQS_match_query_phrase = Mock(return_value=mock_return)
+        result = self.skill._CommonQuerySkill__get_cq(test_phrase)
+        self.skill.CQS_match_query_phrase.assert_called_once_with(test_phrase)
+        self.assertEqual(result, mock_return)
+
+        self.skill.CQS_match_query_phrase.side_effect = Exception()
+        result = self.skill._CommonQuerySkill__get_cq(test_phrase)
+        self.assertIsNone(result)

     def test_remove_noise(self):
-        # TODO
-        pass
+        noisy_match = "what is a computer"
+        normalized = "computer"
+        self.assertEqual(self.skill.remove_noise(noisy_match), normalized)

     def test_calc_confidence(self):
-        # TODO
-        pass
+        generic_q = "what is coca cola"
+        specific_q = "how much caffeine is in coca cola"
+        specific_q_2 = "what is the stock price for coca cola"
+        cw_answer = ("The drink diet coke has 32 milligrams of caffeine in "
+                     "250 milliliters.</speak> Provided by CaffeineWiz.")
+
+        generic_conf = self.skill.calc_confidence("coca cola", generic_q,
+                                                  CQSMatchLevel.GENERAL,
+                                                  cw_answer)
+        exact_conf = self.skill.calc_confidence("coca cola", specific_q,
+                                                CQSMatchLevel.EXACT, cw_answer)
+        low_conf = self.skill.calc_confidence("coca cola", specific_q_2,
+                                              CQSMatchLevel.GENERAL, cw_answer)
+
+        self.assertEqual(exact_conf, 1.0)
+        self.assertLess(generic_conf, exact_conf)
+        self.assertLess(low_conf, generic_conf)

     def test_handle_query_action(self):
         # TODO
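
Since calc_confidence clamps oversized sums, a companion test could pin that behavior down. A sketch of such a check (hypothetical, not in this PR):

    def test_calc_confidence_clamped(self):
        # A long, fully relevant answer must not push confidence past 1.0
        long_answer = ("coca cola " * 30).strip() + "."
        conf = self.skill.calc_confidence("coca cola", "what is coca cola",
                                          CQSMatchLevel.EXACT, long_answer)
        self.assertLessEqual(conf, 1.0)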