From 59976334eb0f2d5250bd517de9f94c3d5b01eac2 Mon Sep 17 00:00:00 2001 From: miro Date: Wed, 16 Oct 2024 00:40:16 +0100 Subject: [PATCH 1/6] fix:standardize_lang --- ocp_pipeline/opm.py | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/ocp_pipeline/opm.py b/ocp_pipeline/opm.py index 273b8f3..19a2d3e 100644 --- a/ocp_pipeline/opm.py +++ b/ocp_pipeline/opm.py @@ -12,6 +12,7 @@ from ovos_bus_client.session import SessionManager from ovos_plugin_manager.ocp import available_extractors from ovos_plugin_manager.templates.pipeline import IntentMatch, PipelinePlugin +from ovos_utils.lang import standardize_lang_tag from ovos_utils.log import LOG from ovos_utils.messagebus import FakeBus from ovos_utils.ocp import MediaType, PlaybackType, PlaybackMode, PlayerState, OCP_ID, \ @@ -102,6 +103,7 @@ def load_classifiers(self): def load_resource_files(self): intents = {} for lang in self.native_langs: + lang = str(standardize_lang_tag(lang)) intents[lang] = {} locale_folder = join(dirname(__file__), "locale", lang) for f in os.listdir(locale_folder): @@ -138,6 +140,7 @@ def register_ocp_intents(self): intent_files = self.load_resource_files() for lang, intent_data in intent_files.items(): + lang = standardize_lang_tag(lang) self.intent_matchers[lang] = IntentContainer() for intent_name in self.intents: samples = intent_data.get(intent_name) @@ -286,6 +289,8 @@ def handle_player_state_update(self, message: Message): def match_high(self, utterances: List[str], lang: str, message: Message = None) -> Optional[IntentMatch]: """ exact matches only, handles playback control recommended after high confidence intents pipeline stage """ + lang = standardize_lang_tag(lang) + # TODO - allow close langs, match dialects if lang not in self.intent_matchers: return None @@ -327,6 +332,8 @@ def match_high(self, utterances: List[str], lang: str, message: Message = None) def match_medium(self, utterances: List[str], lang: str, message: Message = None) -> Optional[IntentMatch]: """ match a utterance via classifiers, recommended before common_qa pipeline stage""" + lang = standardize_lang_tag(lang) + utterance = utterances[0].lower() # is this a OCP query ? is_ocp, bconf = self.is_ocp_query(utterance, lang) @@ -368,6 +375,8 @@ def match_fallback(self, utterances: List[str], lang: str, message: Message = No if not ents: return None + lang = standardize_lang_tag(lang) + # classify the query media type media_type, confidence = self.classify_media(utterance, lang) @@ -388,7 +397,7 @@ def match_fallback(self, utterances: List[str], lang: str, message: Message = No def _process_play_query(self, utterance: str, lang: str, match: dict = None, message: Optional[Message] = None) -> Optional[IntentMatch]: - + lang = standardize_lang_tag(lang) match = match or {} player = self.get_player(message) # if media is currently paused, empty string means "resume playback" @@ -455,6 +464,7 @@ def handle_search_query(self, message: Message): if num: phrase += " " + num + lang = standardize_lang_tag(lang) # classify the query media type media_type, prob = self.classify_media(utterance, lang) # search common play skills @@ -503,6 +513,7 @@ def handle_play_intent(self, message: Message): skills = message.data.get("skills", []) # search common play skills + lang = standardize_lang_tag(lang) results = self._search(query, media_type, lang, skills=skills, message=message) @@ -613,6 +624,7 @@ def handle_search_error_intent(self, message: Message): # NLP def voc_match_media(self, query: str, lang: str) -> Tuple[MediaType, float]: + lang = standardize_lang_tag(lang) # simplistic approach via voc_match, works anywhere # and it's easy to localize, but isn't very accurate if self.voc_match(query, "MusicKeyword", lang=lang): @@ -674,6 +686,7 @@ def voc_match_media(self, query: str, lang: str) -> Tuple[MediaType, float]: def classify_media(self, query: str, lang: str) -> Tuple[MediaType, float]: """ determine what media type is being requested """ + lang = standardize_lang_tag(lang) # using a trained classifier (Experimental) if self.config.get("experimental_media_classifier", False): from ovos_classifiers.skovos.classifier import SklearnOVOSClassifier @@ -701,6 +714,7 @@ def classify_media(self, query: str, lang: str) -> Tuple[MediaType, float]: def is_ocp_query(self, query: str, lang: str) -> Tuple[bool, float]: """ determine if a playback question is being asked""" + lang = standardize_lang_tag(lang) if self.config.get("experimental_binary_classifier", False): from ovos_classifiers.skovos.classifier import SklearnOVOSClassifier try: @@ -731,6 +745,7 @@ def _should_resume(self, phrase: str, lang: str, message: Optional[Message] = No @param phrase: Extracted playback phrase @return: True if player should resume, False if this is a new request """ + lang = standardize_lang_tag(lang) player = self.get_player(message) if player.player_state == PlayerState.PAUSED: if not phrase.strip() or \ @@ -782,6 +797,7 @@ def normalize_results(self, results: list) -> List[Union[MediaEntry, Playlist, P def filter_results(self, results: list, phrase: str, lang: str, media_type: MediaType = MediaType.GENERIC, message: Optional[Message] = None) -> list: + lang = standardize_lang_tag(lang) # ignore very low score matches l1 = len(results) results = [r for r in results @@ -1031,19 +1047,22 @@ def match_legacy(self, utterances: List[str], lang: str, message: Message = None utterance = utterances[0].lower() - match = self.intent_matchers[lang].calc_intent(utterance) + lang = standardize_lang_tag(lang) + # TODO - allow close langs, match dialects + if lang in self.intent_matchers: + match = self.intent_matchers[lang].calc_intent(utterance) - if match["name"] is None: - return None - if match["name"] == "play": - LOG.info(f"Legacy Mycroft CommonPlay match: {match}") - utterance = match["entities"].pop("query") - return IntentMatch(intent_service="OCP_media", - intent_type="ocp:legacy_cps", - intent_data={"query": utterance, - "conf": 0.7}, - skill_id=OCP_ID, - utterance=utterance) + if match["name"] is None: + return None + if match["name"] == "play": + LOG.info(f"Legacy Mycroft CommonPlay match: {match}") + utterance = match["entities"].pop("query") + return IntentMatch(intent_service="OCP_media", + intent_type="ocp:legacy_cps", + intent_data={"query": utterance, + "conf": 0.7}, + skill_id=OCP_ID, + utterance=utterance) def handle_legacy_cps(self, message: Message): """intent handler for legacy CPS matches""" From 3f73cbe05e88e5ee04ab7a2e6ca382c967fd9314 Mon Sep 17 00:00:00 2001 From: miro Date: Wed, 16 Oct 2024 01:11:35 +0100 Subject: [PATCH 2/6] fix:standardize_lang --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ebc59f3..6aad041 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ ovos-workshop>=0.1.7,<2.0.0 -ovos-classifiers \ No newline at end of file +ovos-classifiers +ovos-utils>=0.3.4,<1.0.0 \ No newline at end of file From 362e2df8f927df679e544d081ccc577abc0ca09f Mon Sep 17 00:00:00 2001 From: JarbasAI <33701864+JarbasAl@users.noreply.github.com> Date: Wed, 16 Oct 2024 01:13:54 +0100 Subject: [PATCH 3/6] Update ocp_pipeline/opm.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- ocp_pipeline/opm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocp_pipeline/opm.py b/ocp_pipeline/opm.py index 19a2d3e..1c8dfd3 100644 --- a/ocp_pipeline/opm.py +++ b/ocp_pipeline/opm.py @@ -103,7 +103,7 @@ def load_classifiers(self): def load_resource_files(self): intents = {} for lang in self.native_langs: - lang = str(standardize_lang_tag(lang)) + lang = standardize_lang_tag(lang) intents[lang] = {} locale_folder = join(dirname(__file__), "locale", lang) for f in os.listdir(locale_folder): From 0dd9e89fb2aa707a3463bd54c7629bfb4b10df21 Mon Sep 17 00:00:00 2001 From: miro Date: Wed, 16 Oct 2024 01:16:42 +0100 Subject: [PATCH 4/6] fix:standardize_lang --- ocp_pipeline/opm.py | 50 ++++++++++++++++++++++++++++----------------- requirements.txt | 3 ++- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/ocp_pipeline/opm.py b/ocp_pipeline/opm.py index 1c8dfd3..e5c9893 100644 --- a/ocp_pipeline/opm.py +++ b/ocp_pipeline/opm.py @@ -19,7 +19,7 @@ MediaEntry, Playlist, MediaState, TrackState, dict2entry, PluginStream from ovos_workshop.app import OVOSAbstractApplication from padacioso import IntentContainer - +from langcodes import closest_match from ocp_pipeline.feats import OCPFeaturizer from ocp_pipeline.legacy import LegacyCommonPlay @@ -289,9 +289,8 @@ def handle_player_state_update(self, message: Message): def match_high(self, utterances: List[str], lang: str, message: Message = None) -> Optional[IntentMatch]: """ exact matches only, handles playback control recommended after high confidence intents pipeline stage """ - lang = standardize_lang_tag(lang) - # TODO - allow close langs, match dialects - if lang not in self.intent_matchers: + lang = self._get_closest_lang(lang) + if lang is None: # no intents registered for this lang return None self.bus.emit(Message("ovos.common_play.status")) # sync @@ -1047,22 +1046,35 @@ def match_legacy(self, utterances: List[str], lang: str, message: Message = None utterance = utterances[0].lower() - lang = standardize_lang_tag(lang) - # TODO - allow close langs, match dialects - if lang in self.intent_matchers: - match = self.intent_matchers[lang].calc_intent(utterance) + lang = self._get_closest_lang(lang) + if lang is None: # no intents registered for this lang + return None - if match["name"] is None: - return None - if match["name"] == "play": - LOG.info(f"Legacy Mycroft CommonPlay match: {match}") - utterance = match["entities"].pop("query") - return IntentMatch(intent_service="OCP_media", - intent_type="ocp:legacy_cps", - intent_data={"query": utterance, - "conf": 0.7}, - skill_id=OCP_ID, - utterance=utterance) + match = self.intent_matchers[lang].calc_intent(utterance) + + if match["name"] is None: + return None + if match["name"] == "play": + LOG.info(f"Legacy Mycroft CommonPlay match: {match}") + utterance = match["entities"].pop("query") + return IntentMatch(intent_service="OCP_media", + intent_type="ocp:legacy_cps", + intent_data={"query": utterance, + "conf": 0.7}, + skill_id=OCP_ID, + utterance=utterance) + + def _get_closest_lang(self, lang: str) -> Optional[str]: + if self.intent_matchers: + lang = standardize_lang_tag(lang) + closest, score = closest_match(lang, list(self.intent_matchers.keys())) + # https://langcodes-hickford.readthedocs.io/en/sphinx/index.html#distance-values + # 0 -> These codes represent the same language, possibly after filling in values and normalizing. + # 1- 3 -> These codes indicate a minor regional difference. + # 4 - 10 -> These codes indicate a significant but unproblematic regional difference. + if score < 10: + return closest + return None def handle_legacy_cps(self, message: Message): """intent handler for legacy CPS matches""" diff --git a/requirements.txt b/requirements.txt index 6aad041..6bbea6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ ovos-workshop>=0.1.7,<2.0.0 ovos-classifiers -ovos-utils>=0.3.4,<1.0.0 \ No newline at end of file +ovos-utils>=0.3.4,<1.0.0 +langcodes \ No newline at end of file From efba7e60497fae1fd43a563d30792ba0651f4f32 Mon Sep 17 00:00:00 2001 From: miro Date: Wed, 16 Oct 2024 01:40:25 +0100 Subject: [PATCH 5/6] fix:standardize_lang --- ocp_pipeline/opm.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ocp_pipeline/opm.py b/ocp_pipeline/opm.py index e5c9893..da90673 100644 --- a/ocp_pipeline/opm.py +++ b/ocp_pipeline/opm.py @@ -12,7 +12,7 @@ from ovos_bus_client.session import SessionManager from ovos_plugin_manager.ocp import available_extractors from ovos_plugin_manager.templates.pipeline import IntentMatch, PipelinePlugin -from ovos_utils.lang import standardize_lang_tag +from ovos_utils.lang import standardize_lang_tag, get_language_dir from ovos_utils.log import LOG from ovos_utils.messagebus import FakeBus from ovos_utils.ocp import MediaType, PlaybackType, PlaybackMode, PlayerState, OCP_ID, \ @@ -105,15 +105,16 @@ def load_resource_files(self): for lang in self.native_langs: lang = standardize_lang_tag(lang) intents[lang] = {} - locale_folder = join(dirname(__file__), "locale", lang) - for f in os.listdir(locale_folder): - path = join(locale_folder, f) - if f in self.intents: - with open(path) as intent: - samples = intent.read().split("\n") - for idx, s in enumerate(samples): - samples[idx] = s.replace("{{", "{").replace("}}", "}") - intents[lang][f] = samples + locale_folder = get_language_dir(join(dirname(__file__), "locale"), lang) + if locale_folder is not None: + for f in os.listdir(locale_folder): + path = join(locale_folder, f) + if f in self.intents: + with open(path) as intent: + samples = intent.read().split("\n") + for idx, s in enumerate(samples): + samples[idx] = s.replace("{{", "{").replace("}}", "}") + intents[lang][f] = samples return intents def register_ocp_api_events(self): From 5d350cf52b0891a4d3078d8ba4306e1c052f0ea9 Mon Sep 17 00:00:00 2001 From: miro Date: Wed, 16 Oct 2024 01:57:30 +0100 Subject: [PATCH 6/6] fix:standardize_lang --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6bbea6a..b37171d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ovos-workshop>=0.1.7,<2.0.0 ovos-classifiers -ovos-utils>=0.3.4,<1.0.0 +ovos-utils>=0.3.5,<1.0.0 langcodes \ No newline at end of file