Adding Azure speech, translate and CV tools (#459)

run-llama · Aug 15, 2023 · 4a66f19 · 4a66f19
1 parent 8c46a02
commit 4a66f19
Show file tree

Hide file tree

Showing 12 changed files with 640 additions and 0 deletions.
diff --git a/llama_hub/tools/azure_cv/README.md b/llama_hub/tools/azure_cv/README.md
@@ -0,0 +1,27 @@
+# Azure Computer Vision Tool
+
+This tool connects to an Azure account and allows an Agent to perform a variety of computer vision tasks on image URLs.
+
+You will need to set up an api key and computer vision instance using Azure, learn more here: https://azure.microsoft.com/en-ca/products/cognitive-services/computer-vision
+
+## Usage
+
+This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_vision.ipynb)
+
+Here's an example usage of the AzureCVToolSpec.
+
+```python
+from llama_hub.tools.azure_cv.base import AzureCVToolSpec
+from llama_index.agent import OpenAIAgent
+
+tool_spec = AzureCVToolSpec(api_key='your-key', resource='your-resource')
+
+agent = OpenAIAgent.from_tools(tool_spec.to_tool_list())
+
+agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png')
+agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg')
+```
+
+`process_image`: Send an image for computer vision classification of objects, tags, captioning or OCR.
+
+This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
diff --git a/llama_hub/tools/azure_cv/__init__.py b/llama_hub/tools/azure_cv/__init__.py
@@ -0,0 +1 @@
+## init
diff --git a/llama_hub/tools/azure_cv/base.py b/llama_hub/tools/azure_cv/base.py
@@ -0,0 +1,46 @@
+"""Azure Cognitive Vision tool spec."""
+
+from llama_index.tools.tool_spec.base import BaseToolSpec
+from typing import Optional, List
+import requests
+import urllib.parse
+
+CV_URL_TMPL = "https://{resource}.cognitiveservices.azure.com/computervision/imageanalysis:analyze"
+
+class AzureCVToolSpec(BaseToolSpec):
+    """Azure Cognitive Vision tool spec.
+
+    Posts image URLs to the Azure Computer Vision image-analysis endpoint and
+    returns the decoded JSON response.
+    """
+
+    spec_functions = ["process_image"]
+
+    def __init__(
+        self,
+        resource: str,
+        api_key: str,
+        language: Optional[str] = 'en',
+        api_version: Optional[str] = '2023-04-01-preview'
+    ) -> None:
+        """Initialize with parameters.
+
+        args:
+            resource (str): Azure resource name, substituted into the endpoint host name
+            api_key (str): Key sent in the Ocp-Apim-Subscription-Key header
+            language (Optional[str]): Value of the `language` query parameter
+            api_version (Optional[str]): Value of the `api-version` query parameter
+        """
+        self.api_key = api_key
+        self.cv_url = CV_URL_TMPL.format(resource=resource)
+        self.language = language
+        self.api_version = api_version
+
+    # NOTE(review): the docstring below is written for the agent and is presumably
+    # surfaced as the tool description by to_tool_list(); keep it short and accurate.
+    def process_image(self, url: str, features: List[str]):
+        """
+        This tool accepts an image url and can process and return a variety of text depending on the use case.
+        You can use the features argument to configure what text you want returned.
+
+        args:
+            url (str): The url for the image to caption
+            features (List[str]): Instructions on how to process the image. Valid keys are tags, objects, read, caption
+        """
+        # Tolerate a single feature passed as a bare string; joining it directly
+        # would split it into individual characters.
+        if isinstance(features, str):
+            features = [features]
+
+        # Hand the query parameters to requests so they are percent-encoded,
+        # instead of interpolating raw values into the URL.
+        response = requests.post(
+            self.cv_url,
+            params={
+                'features': ','.join(features),
+                'language': self.language,
+                'api-version': self.api_version,
+            },
+            headers={'Ocp-Apim-Subscription-Key': self.api_key},
+            json={'url': url},
+        )
+        response_json = response.json()
+
+        # Collapse the read result to its text content. Guarded because a
+        # response without readResult/content (for example an error payload)
+        # would otherwise raise KeyError and hide the service's own message.
+        read_result = response_json.get('readResult') if isinstance(response_json, dict) else None
+        if 'read' in features and isinstance(read_result, dict) and 'content' in read_result:
+            response_json['readResult'] = read_result['content']
+
+        return response_json
diff --git a/llama_hub/tools/azure_speech/README.md b/llama_hub/tools/azure_speech/README.md
@@ -0,0 +1,29 @@
+# Azure Speech Tool
+
+This tool allows Agents to use Microsoft Azure speech services to transcribe audio files to text, and create audio files from text. To see more and get started, visit https://azure.microsoft.com/en-us/products/ai-services/ai-speech
+
+## Usage
+
+This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_speech.ipynb)
+
+```python
+from llama_hub.tools.azure_speech.base import AzureSpeechToolSpec
+from llama_index.agent import OpenAIAgent
+
+speech_tool = AzureSpeechToolSpec(
+    speech_key='your-key',
+    region='eastus'
+)
+
+agent = OpenAIAgent.from_tools(
+    speech_tool.to_tool_list(),
+    verbose=True,
+)
+print(agent.chat('Say "hello world"'))
+print(agent.chat('summarize the data/speech.wav audio file into a few sentences'))
+```
+
+- `text_to_speech`: Takes an input string and synthesizes audio to play on the user's computer
+- `speech_to_text`: Takes a .wav file and transcribes it into text
+
+This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
diff --git a/llama_hub/tools/azure_speech/__init__.py b/llama_hub/tools/azure_speech/__init__.py
@@ -0,0 +1 @@
+## init file
diff --git a/llama_hub/tools/azure_speech/base.py b/llama_hub/tools/azure_speech/base.py
@@ -0,0 +1,77 @@
+"""Azure Speech tool spec."""
+
+from llama_index.tools.tool_spec.base import BaseToolSpec
+from typing import Optional, List
+import time
+
+class AzureSpeechToolSpec(BaseToolSpec):
+    """Azure Speech tool spec.
+
+    Exposes speech synthesis (text_to_speech) and file transcription
+    (speech_to_text) through the Azure Speech SDK.
+    """
+
+    spec_functions = ["speech_to_text", "text_to_speech"]
+
+    def __init__(
+        self,
+        region: str,
+        speech_key: str,
+        language: Optional[str] = "en-US"
+    ) -> None:
+        """Initialize with parameters.
+
+        args:
+            region (str): Azure region passed to SpeechConfig
+            speech_key (str): Subscription key passed to SpeechConfig
+            language (Optional[str]): Language assigned to speech_recognition_language
+        """
+        # Function-scope import: the SDK is loaded only when the spec is instantiated.
+        import azure.cognitiveservices.speech as speechsdk
+
+        self.config = speechsdk.SpeechConfig(subscription=speech_key, region=region)
+        self.config.speech_recognition_language = language
+
+    def text_to_speech(self, text: str) -> Optional[str]:
+        """
+        This tool accepts a natural language string and will use Azure speech services to create an
+        audio version of the text, and play it on the users computer.
+
+        args:
+            text (str): The text to play
+        """
+        import azure.cognitiveservices.speech as speechsdk
+
+        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.config)
+        result = speech_synthesizer.speak_text(text)
+
+        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+            return 'Audio playback complete.'
+
+        if result.reason == speechsdk.ResultReason.Canceled:
+            cancellation_details = result.cancellation_details
+            message = "Speech synthesis canceled: {}".format(cancellation_details.reason)
+            print(message)
+            if cancellation_details.reason == speechsdk.CancellationReason.Error:
+                error_message = "Error details: {}".format(cancellation_details.error_details)
+                print(error_message)
+                message = "{} {}".format(message, error_message)
+            # Return the failure as well as printing it, so the caller gets
+            # something more useful than None.
+            return message
+
+        # Any other result reason: nothing to report.
+        return None
+
+    def _transcribe(self, speech_recognizer) -> List[str]:
+        """Run continuous recognition until the session stops or is canceled.
+
+        args:
+            speech_recognizer: Recognizer to drive; must expose the recognized,
+                session_stopped and canceled signals used below.
+
+        returns:
+            The text of every recognized result, in the order received.
+        """
+        done = False
+        results = []
+
+        def collect_cb(evt) -> None:
+            """callback that stores the text of a recognized result"""
+            results.append(evt.result.text)
+
+        def stop_cb(evt) -> None:
+            """callback that stop continuous recognition"""
+            nonlocal done
+            speech_recognizer.stop_continuous_recognition_async()
+            done = True
+
+        speech_recognizer.recognized.connect(collect_cb)
+        speech_recognizer.session_stopped.connect(stop_cb)
+        speech_recognizer.canceled.connect(stop_cb)
+
+        # Start continuous speech recognition
+        speech_recognizer.start_continuous_recognition_async()
+        # Poll until one of the stop callbacks flips the flag.
+        while not done:
+            time.sleep(0.5)
+
+        return results
+
+    def speech_to_text(self, filename: str) -> List[str]:
+        """
+        This tool accepts a filename for a speech audio file and uses Azure to transcribe it into text
+
+        args:
+            filename (str): The name of the file to transcribe
+        """
+        import azure.cognitiveservices.speech as speechsdk
+
+        speech_recognizer = speechsdk.SpeechRecognizer(
+            speech_config=self.config,
+            audio_config=speechsdk.audio.AudioConfig(filename=filename)
+        )
+        return self._transcribe(speech_recognizer)
diff --git a/llama_hub/tools/azure_translate/README.md b/llama_hub/tools/azure_translate/README.md
@@ -0,0 +1,34 @@
+# Azure Translate Tool
+
+This tool connects to an Azure account and allows an Agent to perform text translation into a variety of different languages.
+
+You will need to set up an api key and translate instance using Azure, learn more here: https://learn.microsoft.com/en-us/azure/ai-services/translator/translator-overview
+
+For a full list of supported languages see here: https://learn.microsoft.com/en-us/azure/ai-services/translator/language-support
+
+This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_speech.ipynb)
+
+## Usage
+
+Here's an example usage of the AzureTranslateToolSpec.
+
+```python
+from llama_index.agent import OpenAIAgent
+from llama_hub.tools.azure_translate.base import AzureTranslateToolSpec
+
+translate_tool = AzureTranslateToolSpec(
+    api_key='your-key',
+    region='eastus'
+)
+
+agent = OpenAIAgent.from_tools(
+    translate_tool.to_tool_list(),
+    verbose=True,
+)
+print(agent.chat('Say "hello world" in 5 different languages'))
+```
+
+`translate`: Translate text to a target language
+
+This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
+
diff --git a/llama_hub/tools/azure_translate/__init__.py b/llama_hub/tools/azure_translate/__init__.py
@@ -0,0 +1 @@
+## init
diff --git a/llama_hub/tools/azure_translate/base.py b/llama_hub/tools/azure_translate/base.py
@@ -0,0 +1,41 @@
+"""Azure Translate tool spec."""
+
+from llama_index.tools.tool_spec.base import BaseToolSpec
+from typing import Optional, List
+import requests
+
+ENDPOINT_BASE_URL = "https://api.cognitive.microsofttranslator.com/translate"
+
+class AzureTranslateToolSpec(BaseToolSpec):
+    """Azure Translate tool spec.
+
+    Posts text to the Azure Translator endpoint and returns the decoded JSON response.
+    """
+
+    spec_functions = ["translate"]
+
+    def __init__(self, api_key: str, region: str) -> None:
+        """Initialize with parameters.
+
+        args:
+            api_key (str): Key sent in the Ocp-Apim-Subscription-Key header
+            region (str): Region sent in the Ocp-Apim-Subscription-Region header
+        """
+        # Built once and reused for every translate request.
+        self.headers = {
+            'Ocp-Apim-Subscription-Key': api_key,
+            'Ocp-Apim-Subscription-Region': region,
+            'Content-type': 'application/json',
+        }
+
+    # NOTE(review): the docstring below is written for the agent and is presumably
+    # surfaced as the tool description by to_tool_list(); keep it compact.
+    def translate(self, text: str, language: str):
+        """
+        Use this tool to translate text from one language to another.
+        The source language will be automatically detected. You need to specify the target language
+        using a two character language code.
+        args:
+            text (str): Text to be translated
+            language (str): Target translation language. One of af, sq, am, ar, hy, as, az, bn, ba, eu, bs, bg, ca, hr, cs, da, dv, nl, en, et, fo, fj, fi, fr, gl, ka, de, el, gu, ht, he, hi, hu, is, id, iu, ga, it, ja, kn, kk, km, ko, ku, ky, lo, lv, lt, mk, mg, ms, ml, mt, mi, mr, my, ne, nb, or, ps, fa, pl, pt, pa, ro, ru, sm, sk, sl, so, es, sw, sv, ty, ta, tt, te, th, bo, ti, to, tr, tk, uk, ur, ug, uz, vi, cy, zu
+        """
+
+        # No source language is sent; only the target ('to') is specified.
+        response = requests.post(
+            ENDPOINT_BASE_URL,
+            params={
+                'api-version': '3.0',
+                'to': language
+            },
+            headers=self.headers,
+            json=[{'text': text}]
+        )
+        return response.json()
diff --git a/llama_hub/tools/library.json b/llama_hub/tools/library.json
@@ -1,4 +1,17 @@
 {
+  "AzureCVToolSpec": {
+    "id": "tools/azure_cv",
+    "author": "ajhofmann",
+    "keywords": ["image", "vision", "cv"]
+  },
+  "AzureSpeechToolSpec": {
+    "id": "tools/azure_speech",
+    "author": "ajhofmann"
+  },
+  "AzureTranslateToolSpec": {
+    "id": "tools/azure_translate",
+    "author": "ajhofmann" 
+  },
   "BingSearchToolSpec": {
     "id": "tools/bing_search",
     "author": "ajhofmann"