This repository has been archived by the owner on Mar 1, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 736
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding Azure speech, translate and CV tools (#459)
- Loading branch information
Showing
12 changed files
with
640 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Azure Computer Vision Tool | ||
|
||
This tool connects to an Azure account and allows an Agent to perform a variety of computer vision tasks on image URLs. | ||
|
||
You will need to set up an api key and computer vision instance using Azure, learn more here: https://azure.microsoft.com/en-ca/products/cognitive-services/computer-vision | ||
|
||
## Usage | ||
|
||
This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_vision.ipynb) | ||
|
||
Here's an example usage of the AzureCVToolSpec. | ||
|
||
```python | ||
from llama_hub.tools.azure_cv.base import AzureCVToolSpec | ||
from llama_index.agent import OpenAIAgent | ||
|
||
tool_spec = AzureCVToolSpec(api_key='your-key', resource='your-resource') | ||
|
||
agent = OpenAIAgent.from_tools(tool_spec.to_tool_list()) | ||
|
||
agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png') | ||
agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg') | ||
``` | ||
|
||
`process_image`: Send an image for computer vision classification of objects, tags, captioning or OCR. | ||
|
||
This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
## init |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
"""Azure Cognitive Vision tool spec.""" | ||
|
||
from llama_index.tools.tool_spec.base import BaseToolSpec | ||
from typing import Optional, List | ||
import requests | ||
import urllib.parse | ||
|
||
CV_URL_TMPL = "https://{resource}.cognitiveservices.azure.com/computervision/imageanalysis:analyze"


class AzureCVToolSpec(BaseToolSpec):
    """Azure Cognitive Vision tool spec."""

    spec_functions = ["process_image"]

    def __init__(
        self,
        resource: str,
        api_key: str,
        language: Optional[str] = "en",
        api_version: Optional[str] = "2023-04-01-preview",
    ) -> None:
        """Initialize with parameters.

        args:
            resource (str): Azure resource name, used as the endpoint subdomain
            api_key (str): Subscription key sent with every request
            language (Optional[str]): Language code passed to the service
            api_version (Optional[str]): Image Analysis API version to call
        """
        self.api_key = api_key
        self.cv_url = CV_URL_TMPL.format(resource=resource)
        self.language = language
        self.api_version = api_version

    def process_image(self, url: str, features: List[str]):
        """
        This tool accepts an image url and can process and return a variety of text depending on the use case.
        You can use the features argument to configure what text you want returned.
        args:
            url (str): The url for the image to caption
            features (List[str]): Instructions on how to process the image. Valid keys are tags, objects, read, caption
        """
        query_params = {
            "features": ",".join(features),
            "language": self.language,
            "api-version": self.api_version,
        }
        # Encode the query properly instead of interpolating raw values, and
        # omit unset options rather than sending the literal string "None".
        # Commas stay unescaped so the feature list looks the same on the wire.
        query = urllib.parse.urlencode(
            {key: value for key, value in query_params.items() if value is not None},
            safe=",",
        )
        response = requests.post(
            f"{self.cv_url}?{query}",
            headers={"Ocp-Apim-Subscription-Key": self.api_key},
            json={"url": url},
        )
        response_json = response.json()

        # Collapse the verbose OCR payload to its plain text, but only when it
        # is actually present: error responses carry no 'readResult', and the
        # caller should receive that payload unchanged instead of a KeyError.
        if "read" in features and isinstance(response_json, dict):
            read_result = response_json.get("readResult")
            if isinstance(read_result, dict) and "content" in read_result:
                response_json["readResult"] = read_result["content"]

        return response_json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Azure Speech Tool | ||
|
||
This tool allows Agents to use Microsoft Azure speech services to transcribe audio files to text, and create audio files from text. To see more and get started, visit https://azure.microsoft.com/en-us/products/ai-services/ai-speech | ||
|
||
## Usage | ||
|
||
This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_speech.ipynb) | ||
|
||
```python | ||
from llama_hub.tools.azure_speech.base import AzureSpeechToolSpec | ||
from llama_index.agent import OpenAIAgent | ||
|
||
speech_tool = AzureSpeechToolSpec( | ||
speech_key='your-key', | ||
region='eastus' | ||
) | ||
|
||
agent = OpenAIAgent.from_tools( | ||
speech_tool.to_tool_list(), | ||
verbose=True, | ||
) | ||
print(agent.chat('Say "hello world"')) | ||
print(agent.chat('summarize the data/speech.wav audio file into a few sentences')) | ||
``` | ||
|
||
`text_to_speech`: Takes an input string and synthesizes audio to play on the user's computer | ||
`speech_to_text`: Takes a .wav file and transcribes it into text | ||
|
||
This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
## init file |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
"""Azure Speech tool spec.""" | ||
|
||
from llama_index.tools.tool_spec.base import BaseToolSpec | ||
from typing import Optional, List | ||
import time | ||
|
||
class AzureSpeechToolSpec(BaseToolSpec):
    """Azure Speech tool spec."""

    spec_functions = ["speech_to_text", "text_to_speech"]

    def __init__(
        self,
        region: str,
        speech_key: str,
        language: Optional[str] = "en-US",
    ) -> None:
        """Initialize with parameters.

        args:
            region (str): Azure region of the speech resource
            speech_key (str): Subscription key for the speech resource
            language (Optional[str]): Recognition language; when None the SDK default is kept
        """
        # Imported lazily so the module can be loaded without the Azure SDK.
        import azure.cognitiveservices.speech as speechsdk

        self.config = speechsdk.SpeechConfig(subscription=speech_key, region=region)
        if language is not None:
            self.config.speech_recognition_language = language

    def text_to_speech(self, text: str) -> str:
        """
        This tool accepts a natural language string and will use Azure speech services to create an
        audio version of the text, and play it on the user's computer.
        args:
            text (str): The text to play
        """
        import azure.cognitiveservices.speech as speechsdk

        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.config)
        result = speech_synthesizer.speak_text(text)

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            return "Audio playback complete."

        if result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            # Return the failure to the caller (the agent) instead of only
            # printing it, so the problem is visible in the tool output.
            message = "Speech synthesis canceled: {}".format(cancellation_details.reason)
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                message += " Error details: {}".format(cancellation_details.error_details)
            return message

        return "Speech synthesis finished with unexpected result: {}".format(result.reason)

    def _transcribe(self, speech_recognizer) -> List[str]:
        """Run continuous recognition until the session ends and collect the recognized text.

        args:
            speech_recognizer: A recognizer exposing the `recognized`, `session_stopped`
                and `canceled` signals plus the continuous-recognition start/stop calls
        """
        done = False
        results: List[str] = []

        def collect_cb(evt) -> None:
            """callback that stores the text of each recognized segment"""
            results.append(evt.result.text)

        def stop_cb(evt) -> None:
            """callback that stops continuous recognition"""
            nonlocal done
            speech_recognizer.stop_continuous_recognition_async()
            done = True

        speech_recognizer.recognized.connect(collect_cb)
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        # Start continuous speech recognition and poll until a stop or cancel
        # event flips the flag from the callback.
        speech_recognizer.start_continuous_recognition_async()
        while not done:
            time.sleep(0.5)

        return results

    def speech_to_text(self, filename: str) -> List[str]:
        """
        This tool accepts a filename for a speech audio file and uses Azure to transcribe it into text
        args:
            filename (str): The name of the file to transcribe
        """
        import azure.cognitiveservices.speech as speechsdk

        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.config,
            audio_config=speechsdk.audio.AudioConfig(filename=filename),
        )
        return self._transcribe(speech_recognizer)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Azure Translate Tool | ||
|
||
This tool connects to an Azure account and allows an Agent to perform text translation into a variety of different languages | ||
|
||
You will need to set up an api key and translate instance using Azure, learn more here: https://learn.microsoft.com/en-us/azure/ai-services/translator/translator-overview | ||
|
||
For a full list of supported languages see here: https://learn.microsoft.com/en-us/azure/ai-services/translator/language-support | ||
|
||
This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_translate.ipynb) | ||
|
||
## Usage | ||
|
||
Here's an example usage of the AzureTranslateToolSpec. | ||
|
||
```python | ||
from llama_index.agent import OpenAIAgent | ||
from llama_hub.tools.azure_translate.base import AzureTranslateToolSpec | ||
|
||
translate_tool = AzureTranslateToolSpec( | ||
api_key='your-key', | ||
region='eastus' | ||
) | ||
|
||
agent = OpenAIAgent.from_tools( | ||
translate_tool.to_tool_list(), | ||
verbose=True, | ||
) | ||
print(agent.chat('Say "hello world" in 5 different languages')) | ||
``` | ||
|
||
`translate`: Translate text to a target language | ||
|
||
This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
## init |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
"""Azure Translate tool spec.""" | ||
|
||
from llama_index.tools.tool_spec.base import BaseToolSpec | ||
from typing import Optional, List | ||
import requests | ||
|
||
ENDPOINT_BASE_URL = "https://api.cognitive.microsofttranslator.com/translate"


class AzureTranslateToolSpec(BaseToolSpec):
    """Azure Translate tool spec."""

    spec_functions = ["translate"]

    def __init__(self, api_key: str, region: str) -> None:
        """Initialize with parameters.

        args:
            api_key (str): Subscription key for the translator resource
            region (str): Azure region of the translator resource
        """
        self.headers = {
            "Ocp-Apim-Subscription-Key": api_key,
            "Ocp-Apim-Subscription-Region": region,
            "Content-type": "application/json",
        }

    def translate(self, text: str, language: str):
        """
        Use this tool to translate text from one language to another.
        The source language will be automatically detected. You need to specify the target language
        using a two character language code.
        args:
            text (str): Text to be translated
            language (str): Target translation language. One of af, sq, am, ar, hy, as, az, bn, ba, eu, bs, bg, ca, hr, cs, da, dv, nl, en, et, fo, fj, fi, fr, gl, ka, de, el, gu, ht, he, hi, hu, is, id, iu, ga, it, ja, kn, kk, km, ko, ku, ky, lo, lv, lt, mk, mg, ms, ml, mt, mi, mr, my, ne, nb, or, ps, fa, pl, pt, pa, ro, ru, sm, sk, sl, so, es, sw, sv, ty, ta, tt, te, th, bo, ti, to, tr, tk, uk, ur, ug, uz, vi, cy, zu
        """
        response = requests.post(
            ENDPOINT_BASE_URL,
            params={
                "api-version": "3.0",
                "to": language,
            },
            headers=self.headers,
            # The request body is a JSON array with one object per text to translate.
            json=[{"text": text}],
        )
        # The decoded JSON is returned as-is, so service error payloads are
        # also passed through to the caller.
        return response.json()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.