From deeabc18e9123c966271ca95c2ef7ef9f001e992 Mon Sep 17 00:00:00 2001
From: NeonJarbas <59943014+NeonJarbas@users.noreply.github.com>
Date: Mon, 8 Jan 2024 00:15:44 +0000
Subject: [PATCH 1/3] feat/ocp_voc_match (#172)

* Increment Version to 0.0.16a3

* OCP skill keyword matching utils

ocp-nlp bus api

* leave bus messages for later, keep PR focused on ocp_voc_match

* bump

* bump

* load_ocp_keyword_from_csv

---------

Co-authored-by: NeonJarbas <NeonJarbas@users.noreply.github.com>
Co-authored-by: JarbasAi <jarbasai@mailfence.com>
---
 .github/workflows/unit_tests.yml    |  10 +-
 ovos_workshop/decorators/ocp.py     | 132 +-----------------------
 ovos_workshop/skills/common_play.py | 151 ++++++++++++++++++++++++++--
 requirements/requirements.txt       |   5 +-
 4 files changed, 154 insertions(+), 144 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 9d5c634b..d25526b5 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -12,7 +12,7 @@ on:
       - 'LICENSE'
       - 'CHANGELOG.md'
       - 'MANIFEST.in'
-      - 'readme.md'
+      - 'README.md'
       - 'scripts/**'
   push:
     branches:
@@ -26,7 +26,7 @@ on:
       - 'LICENSE'
       - 'CHANGELOG.md'
       - 'MANIFEST.in'
-      - 'readme.md'
+      - 'README.md'
       - 'scripts/**'
   workflow_dispatch:
 
@@ -52,13 +52,13 @@ jobs:
           sudo apt-get update
           sudo apt install python3-dev
           python -m pip install build wheel
-      - name: Install ovos workshop
-        run: |
-          pip install -e .
       - name: Install test dependencies
         run: |
           sudo apt install libssl-dev libfann-dev portaudio19-dev libpulse-dev
           pip install -r requirements/test.txt
+      - name: Install ovos workshop
+        run: |
+          pip install -e .
       - name: Run unittests
         run: |
           pytest --cov=ovos_workshop --cov-report xml test/unittests
diff --git a/ovos_workshop/decorators/ocp.py b/ovos_workshop/decorators/ocp.py
index 164563a0..c8449816 100644
--- a/ovos_workshop/decorators/ocp.py
+++ b/ovos_workshop/decorators/ocp.py
@@ -1,3 +1,6 @@
+# backwards compat imports
+from ovos_utils.ocp import MediaType, PlayerState, MediaState, MatchConfidence,\
+    PlaybackType, PlaybackMode, LoopState, TrackState
 
 
 def ocp_search():
@@ -124,132 +127,3 @@ def real_decorator(func):
 
     return real_decorator
 
-
-try:
-    from ovos_plugin_common_play.ocp.status import MediaType, PlayerState, \
-        MediaState, MatchConfidence, PlaybackType, PlaybackMode, LoopState, \
-        TrackState
-except ImportError:
-
-    # TODO - manually keep these in sync as needed
-    # apps interfacing with OCP need the enums,
-    # but they are native to OCP does not make sense for OCP to import them from here,
-    # therefore we duplicate them when needed
-    from enum import IntEnum
-
-
-    class MatchConfidence(IntEnum):
-        EXACT = 95
-        VERY_HIGH = 90
-        HIGH = 80
-        AVERAGE_HIGH = 70
-        AVERAGE = 50
-        AVERAGE_LOW = 30
-        LOW = 15
-        VERY_LOW = 1
-
-
-    class TrackState(IntEnum):
-        DISAMBIGUATION = 1  # media result, not queued for playback
-
-        PLAYING_SKILL = 20  # Skill is handling playback internally
-        PLAYING_AUDIOSERVICE = 21  # Skill forwarded playback to audio service
-        PLAYING_VIDEO = 22  # Skill forwarded playback to gui player
-        PLAYING_AUDIO = 23  # Skill forwarded audio playback to gui player
-        PLAYING_MPRIS = 24  # External media player is handling playback
-        PLAYING_WEBVIEW = 25  # Media playback handled in browser (eg. javascript)
-
-        QUEUED_SKILL = 30  # Waiting playback to be handled inside skill
-        QUEUED_AUDIOSERVICE = 31  # Waiting playback in audio service
-        QUEUED_VIDEO = 32  # Waiting playback in gui
-        QUEUED_AUDIO = 33  # Waiting playback in gui
-        QUEUED_WEBVIEW = 34  # Waiting playback in gui
-
-
-    class MediaState(IntEnum):
-        # https://doc.qt.io/qt-5/qmediaplayer.html#MediaStatus-enum
-        # The status of the media cannot be determined.
-        UNKNOWN = 0
-        # There is no current media. PlayerState == STOPPED
-        NO_MEDIA = 1
-        # The current media is being loaded. The player may be in any state.
-        LOADING_MEDIA = 2
-        # The current media has been loaded. PlayerState== STOPPED
-        LOADED_MEDIA = 3
-        # Playback of the current media has stalled due to
-        # insufficient buffering or some other temporary interruption.
-        # PlayerState != STOPPED
-        STALLED_MEDIA = 4
-        # The player is buffering data but has enough data buffered
-        # for playback to continue for the immediate future.
-        # PlayerState != STOPPED
-        BUFFERING_MEDIA = 5
-        # The player has fully buffered the current media. PlayerState != STOPPED
-        BUFFERED_MEDIA = 6
-        # Playback has reached the end of the current media. PlayerState == STOPPED
-        END_OF_MEDIA = 7
-        # The current media cannot be played. PlayerState == STOPPED
-        INVALID_MEDIA = 8
-
-
-    class PlayerState(IntEnum):
-        # https://doc.qt.io/qt-5/qmediaplayer.html#State-enum
-        STOPPED = 0
-        PLAYING = 1
-        PAUSED = 2
-
-
-    class LoopState(IntEnum):
-        NONE = 0
-        REPEAT = 1
-        REPEAT_TRACK = 2
-
-
-    class PlaybackType(IntEnum):
-        SKILL = 0  # skills handle playback whatever way they see fit,
-        # eg spotify / mycroft common play
-        VIDEO = 1  # Video results
-        AUDIO = 2  # Results should be played audio only
-        AUDIO_SERVICE = 3  # Results should be played without using the GUI
-        MPRIS = 4  # External MPRIS compliant player
-        WEBVIEW = 5  # GUI webview, render a url instead of media player
-        UNDEFINED = 100  # data not available, hopefully status will be updated soon..
-
-
-    class PlaybackMode(IntEnum):
-        AUTO = 0  # play each entry as considered appropriate,
-        # ie, make it happen the best way possible
-        AUDIO_ONLY = 10  # only consider audio entries
-        VIDEO_ONLY = 20  # only consider video entries
-        FORCE_AUDIO = 30  # cast video to audio unconditionally
-        # (audio can still play in mycroft-gui)
-        FORCE_AUDIOSERVICE = 40  # cast everything to audio service backend,
-        # mycroft-gui will not be used
-        EVENTS_ONLY = 50  # only emit ocp events, do not display or play anything.
-        # allows integration with external interfaces
-
-
-    class MediaType(IntEnum):
-        GENERIC = 0
-        AUDIO = 1
-        MUSIC = 2
-        VIDEO = 3
-        AUDIOBOOK = 4
-        GAME = 5
-        PODCAST = 6
-        RADIO = 7
-        NEWS = 8
-        TV = 9
-        MOVIE = 10
-        TRAILER = 11
-        VISUAL_STORY = 13
-        BEHIND_THE_SCENES = 14
-        DOCUMENTARY = 15
-        RADIO_THEATRE = 16
-        SHORT_FILM = 17
-        SILENT_MOVIE = 18
-        BLACK_WHITE_MOVIE = 20
-        CARTOON = 21
-
-        ADULT = 69
-        HENTAI = 70
diff --git a/ovos_workshop/skills/common_play.py b/ovos_workshop/skills/common_play.py
index 8d063cd4..7d7472d5 100644
--- a/ovos_workshop/skills/common_play.py
+++ b/ovos_workshop/skills/common_play.py
@@ -3,11 +3,15 @@
 from ovos_workshop.skills.ovos import OVOSSkill
 from ovos_bus_client import Message
 from ovos_utils.log import LOG
-
+from ovos_utils import camel_case_split
+from typing import List
+from ovos_classifiers.skovos.features import KeywordFeatures
 
 # backwards compat imports, do not delete, skills import from here
 from ovos_workshop.decorators.ocp import ocp_play, ocp_next, ocp_pause, ocp_resume, ocp_search, \
-    ocp_previous, ocp_featured_media, MediaType, MediaState, MatchConfidence, \
+    ocp_previous, ocp_featured_media
+
+from ovos_utils.ocp import MediaType, MediaState, MatchConfidence, \
     PlaybackType, PlaybackMode, PlayerState, LoopState, TrackState
 
 
@@ -50,10 +54,13 @@ def ...
     vocab for starting playback is needed.
     """
 
-    def __init__(self, name=None, bus=None, **kwargs):
+    def __init__(self, *args, **kwargs):
         # NOTE: derived skills will likely want to override this list
-        self.supported_media = [MediaType.GENERIC,
-                                MediaType.AUDIO]
+        self.supported_media = [MediaType.GENERIC]
+        skill_name = camel_case_split(self.__class__.__name__)
+        alt = skill_name.replace(" skill", "").replace(" Skill", "")
+        self.skill_aliases = [skill_name, alt]
+
         self._search_handlers = []  # added via decorators
         self._featured_handlers = []  # added via decorators
         self._current_query = None
@@ -68,7 +75,9 @@ def __init__(self, name=None, bus=None, **kwargs):
         self.skill_icon = \
             "https://github.com/OpenVoiceOS/ovos-ocp-audio-plugin/raw/master/" \
             "ovos_plugin_common_play/ocp/res/ui/images/ocp.png"
-        OVOSSkill.__init__(self, name, bus, **kwargs)
+
+        self.ocp_matchers = {}
+        super().__init__(*args, **kwargs)
 
     def bind(self, bus):
         """Overrides the normal bind method.
@@ -104,15 +113,140 @@ def bind(self, bus):
             self.add_event("mycroft.stop",
                            self.__handle_stop_search)
 
-    def __handle_ocp_skills_get(self, message):
+    def register_media_type(self, media_type: MediaType):
+        """ helper instead of editing self.supported_media directly
+        will auto-sync changes via bus"""
+        if media_type not in self.supported_media:
+            self.supported_media.append(media_type)
+            LOG.info(f"{self.skill_id} registered type {media_type}")
+            self.__handle_ocp_skills_get()
+
+    def __handle_ocp_skills_get(self, message=None):
+        """ report skill OCP info
+
+        thumbnail and featured tracks inform the OCP homescreen
+
+        media_type and skill_name help the classifier disambiguate between media_types
+            eg, if OCP finds the name of a movie skill in user utterance
+                it will search netflix instead of spotify
+         """
+        message = message or Message("")
         self.bus.emit(
             message.reply('ovos.common_play.announce',
                           {"skill_id": self.skill_id,
-                           "skill_name": self.name,
+                           "skill_name": self.skill_name,
+                           "aliases": self.skill_aliases,
                            "thumbnail": self.skill_icon,
                            "media_type": self.supported_media,
                            "featured_tracks": len(self._featured_handlers) >= 1}))
 
+    def ocp_voc_match(self, utterance, lang=None):
+        """uses Aho–Corasick algorithm to match OCP keywords
+        this efficiently matches many keywords against an utterance
+
+        OCP keywords are registered via self.register_ocp_keyword
+
+        example usages
+            print(self.ocp_voc_match("play metallica"))
+            # {'album_name': 'Metallica', 'artist_name': 'Metallica'}
+
+            print(self.ocp_voc_match("play the beatles"))
+            # {'album_name': 'The Beatles', 'series_name': 'The Beatles',
+            # 'artist_name': 'The Beatles', 'movie_name': 'The Beatles'}
+
+            print(self.ocp_voc_match("play rob zombie"))
+            # {'artist_name': 'Rob Zombie', 'album_name': 'Zombie',
+            # 'book_name': 'Zombie', 'game_name': 'Zombie', 'movie_name': 'Zombie'}
+
+            print(self.ocp_voc_match("play horror movie"))
+            # {'film_genre': 'Horror', 'cartoon_genre': 'Horror', 'anime_genre': 'Horror',
+            # 'radio_drama_genre': 'horror', 'video_genre': 'horror',
+            # 'book_genre': 'Horror', 'movie_name': 'Horror Movie'}
+
+            print(self.ocp_voc_match("play science fiction"))
+            #  {'film_genre': 'Science Fiction', 'cartoon_genre': 'Science Fiction',
+            #  'podcast_genre': 'Fiction', 'anime_genre': 'Science Fiction',
+            #  'documentary_genre': 'Science', 'book_genre': 'Science Fiction',
+            #  'artist_name': 'Fiction', 'tv_channel': 'Science',
+            #  'album_name': 'Science Fiction', 'short_film_name': 'Science',
+            #  'book_name': 'Science Fiction', 'movie_name': 'Science Fiction'}
+        """
+        lang = lang or self.lang
+        if lang not in self.ocp_matchers:
+            return {}
+        matches = {}
+        for k, v in self.ocp_matchers[lang].match(utterance):
+            if k not in matches or len(v) > len(matches[k]):
+                matches[k] = v
+        return matches
+
+    def load_ocp_keyword_from_csv(self, csv_path: str, lang: str):
+        """ load entities from a .csv file for usage with self.ocp_voc_match
+        see the ocp_entities.csv datasets for example files built from wikidata SPARQL queries
+
+        example contents of csv file
+
+            label,entity
+            film_genre,swashbuckler film
+            film_genre,neo-noir
+            film_genre,actual play film
+            film_genre,alternate history film
+            film_genre,spy film
+            ...
+        """
+        if lang not in self.ocp_matchers:
+            self.ocp_matchers[lang] = KeywordFeatures()
+        self.ocp_matchers[lang].load_entities(csv_path)
+
+    def register_ocp_keyword(self, media_type: MediaType, label: str,
+                             samples: List, langs: List[str] = None):
+        """ register strings as native OCP keywords (eg, movie_name, artist_name ...)
+
+        ocp keywords can be efficiently matched with self.ocp_voc_match helper method
+        that uses Aho–Corasick algorithm
+        """
+        langs = langs or self.native_langs
+        for l in langs:
+            if l not in self.ocp_matchers:
+                self.ocp_matchers[l] = KeywordFeatures()
+            self.ocp_matchers[l].register_entity(label, samples)
+
+        # TODO - send bus message once Pipeline is in
+        #  if the label is a valid OCP entity known by the classifier it will help
+        #  the classifier disambiguate between media_types
+        #  eg, if OCP finds a movie name in user utterances it will
+        #      prefer to search netflix instead of spotify
+        # right now only used for internal matching
+        # NB: consider sending a file path,
+        # bus messages with thousands of entities dont work well
+        #self.bus.emit(
+        #    Message('ovos.common_play.register_keyword',
+        #            {"skill_id": self.skill_id,
+        #             "label": label,  # if in OCP_ENTITIES it influences classifier
+        #             "langs": langs,
+        #             "samples": samples,
+        #             "media_type": media_type}))
+
+    def deregister_ocp_keyword(self, media_type: MediaType, label: str,
+                               langs: List[str] = None):
+        langs = langs or self.native_langs
+        for l in langs:
+            if l in self.ocp_matchers:
+                self.ocp_matchers[l].deregister_entity(label)
+
+        # TODO - send bus message once Pipeline is in
+        #  if the label is a valid OCP entity known by the classifier it will help
+        #  the classifier disambiguate between media_types
+        #  eg, if OCP finds a movie name in user utterances it will
+        #      prefer to search netflix instead of spotify
+        # right now only used for internal matching
+        #self.bus.emit(
+        #    Message('ovos.common_play.deregister_keyword',
+        #            {"skill_id": self.skill_id,
+        #             "label": label,
+        #             "langs": langs,
+        #             "media_type": media_type}))
+
     def _register_decorated(self):
         # register search handlers
         for attr_name in get_non_properties(self):
@@ -162,6 +296,7 @@ def _register_decorated(self):
                         LOG.warning("multiple declarations of resume playback"
                                     "handler, replacing previous handler")
                     self.__resume_handler = method
+
         super()._register_decorated()
 
         # volunteer info to OCP
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 9981a78c..f524799c 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,6 +1,7 @@
-ovos-utils < 0.2.0, >=0.0.37
+ovos-utils >= 0.1.0a7, < 0.2.0
+ovos-bus-client < 0.1.0, >=0.0.9a2
 ovos_config < 0.2.0,>=0.0.12
-ovos_bus_client < 0.2.0, >=0.0.8
 ovos_backend_client < 0.2.0, >=0.1.0
 ovos-lingua-franca~=0.4, >=0.4.6
+ovos_classifiers>=0.0.0a46
 rapidfuzz

From c95ca8ec12407281a6f9f4d7829ee233412f7200 Mon Sep 17 00:00:00 2001
From: JarbasAl <JarbasAl@users.noreply.github.com>
Date: Mon, 8 Jan 2024 00:16:01 +0000
Subject: [PATCH 2/3] Increment Version to 0.0.16a5

---
 ovos_workshop/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ovos_workshop/version.py b/ovos_workshop/version.py
index d2ff723f..d26fcd71 100644
--- a/ovos_workshop/version.py
+++ b/ovos_workshop/version.py
@@ -3,5 +3,5 @@
 VERSION_MAJOR = 0
 VERSION_MINOR = 0
 VERSION_BUILD = 16
-VERSION_ALPHA = 4
+VERSION_ALPHA = 5
 # END_VERSION_BLOCK

From 2adaf46091e05f1a8791d3a9144a477a539aacd8 Mon Sep 17 00:00:00 2001
From: JarbasAl <JarbasAl@users.noreply.github.com>
Date: Mon, 8 Jan 2024 00:16:32 +0000
Subject: [PATCH 3/3] Update Changelog

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e3fa5034..7bcaf247 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## [0.0.16a5](https://github.com/OpenVoiceOS/OVOS-workshop/tree/0.0.16a5) (2024-01-08)
+
+[Full Changelog](https://github.com/OpenVoiceOS/OVOS-workshop/compare/0.0.16a4...0.0.16a5)
+
+**Implemented enhancements:**
+
+- feat/ocp\_voc\_match [\#172](https://github.com/OpenVoiceOS/OVOS-workshop/pull/172) ([NeonJarbas](https://github.com/NeonJarbas))
+
 ## [0.0.16a4](https://github.com/OpenVoiceOS/OVOS-workshop/tree/0.0.16a4) (2024-01-06)
 
 [Full Changelog](https://github.com/OpenVoiceOS/OVOS-workshop/compare/0.0.16a3...0.0.16a4)