Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Adding Azure speech, translate and CV tools #459

Merged
merged 6 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions llama_hub/tools/azure_cv/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Azure Computer Vision Tool

This tool connects to an Azure account and allows an Agent to perform a variety of computer vision tasks on image urls.

You will need to set up an api key and computer vision instance using Azure, learn more here: https://azure.microsoft.com/en-ca/products/cognitive-services/computer-vision

## Usage

This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_vision.ipynb)

Here's an example usage of the AzureCVToolSpec.

```python
from llama_hub.tools.azure_cv.base import AzureCVToolSpec
from llama_index.agent import OpenAIAgent

tool_spec = AzureCVToolSpec(api_key='your-key', resource='your-resource')

agent = OpenAIAgent.from_tools(tool_spec.to_tool_list())

agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png')
agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg')
```

`process_image`: Send an image for computer vision classification of objects, tags, captioning or OCR.

This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
1 change: 1 addition & 0 deletions llama_hub/tools/azure_cv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
## init
46 changes: 46 additions & 0 deletions llama_hub/tools/azure_cv/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Azure Cognitive Vision tool spec."""

from llama_index.tools.tool_spec.base import BaseToolSpec
from typing import Optional, List
import requests
import urllib.parse

CV_URL_TMPL = "https://{resource}.cognitiveservices.azure.com/computervision/imageanalysis:analyze"

class AzureCVToolSpec(BaseToolSpec):
    """Azure Cognitive Vision tool spec.

    Wraps the Azure Image Analysis REST endpoint so an agent can run
    captioning, tagging, object detection, and OCR on an image URL.
    """

    spec_functions = ["process_image"]

    def __init__(
        self,
        resource: str,
        api_key: str,
        language: Optional[str] = 'en',
        api_version: Optional[str] = '2023-04-01-preview'
    ) -> None:
        """Initialize with parameters.

        Args:
            resource (str): Name of the Azure Computer Vision resource;
                interpolated into the endpoint hostname.
            api_key (str): Azure subscription key for the resource.
            language (Optional[str]): Language code for returned text.
            api_version (Optional[str]): Image Analysis API version to call.
        """
        self.api_key = api_key
        self.cv_url = CV_URL_TMPL.format(resource=resource)
        self.language = language
        self.api_version = api_version

    def process_image(self, url: str, features: List[str]):
        """
        This tool accepts an image url or file and can process and return a variety of text depending on the use case.
        You can use the features argument to configure what text you want returned.

        args:
            url (str): The url for the image to caption
            features (List[str]): Instructions on how to process the image. Valid keys are tags, objects, read, caption
        """
        # Send the query string via `params` so every value is URL-encoded,
        # rather than interpolating raw strings into the URL by hand.
        response = requests.post(
            self.cv_url,
            params={
                'features': ','.join(features),
                'language': self.language,
                'api-version': self.api_version,
            },
            headers={'Ocp-Apim-Subscription-Key': self.api_key},
            json={'url': url},
        )
        response_json = response.json()
        # For OCR requests, flatten the read result down to just its text
        # content so the agent receives plain text instead of nested JSON.
        if 'read' in features:
            response_json['readResult'] = response_json['readResult']['content']

        return response_json
29 changes: 29 additions & 0 deletions llama_hub/tools/azure_speech/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Azure Speech Tool

This tool allows Agents to use Microsoft Azure speech services to transcribe audio files to text, and create audio files from text. To see more and get started, visit https://azure.microsoft.com/en-us/products/ai-services/ai-speech

## Usage

This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_speech.ipynb)

```python
from llama_hub.tools.azure_speech.base import AzureSpeechToolSpec
from llama_index.agent import OpenAIAgent

speech_tool = AzureSpeechToolSpec(
speech_key='your-key',
region='eastus'
)

agent = OpenAIAgent.from_tools(
speech_tool.to_tool_list(),
verbose=True,
)
print(agent.chat('Say "hello world"'))
print(agent.chat('summarize the data/speech.wav audio file into a few sentences'))
```

`text_to_speech`: Takes an input string and synthesizes audio to play on the users computer
`speech_to_text`: Takes a .wav file and transcribes it into text

This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
1 change: 1 addition & 0 deletions llama_hub/tools/azure_speech/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
## init file
77 changes: 77 additions & 0 deletions llama_hub/tools/azure_speech/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Azure Speech tool spec."""

from llama_index.tools.tool_spec.base import BaseToolSpec
from typing import Optional, List
import time

class AzureSpeechToolSpec(BaseToolSpec):
    """Azure Speech tool spec.

    Exposes Azure Cognitive Services speech synthesis (text -> spoken audio
    on the local default output device) and continuous speech recognition
    (.wav file -> transcribed text) as agent tools.
    """

    spec_functions = ["speech_to_text", "text_to_speech"]

    def __init__(
        self,
        region: str,
        speech_key: str,
        language: Optional[str] = "en-US"
    ) -> None:
        """Initialize with parameters.

        Args:
            region (str): Azure region of the Speech resource (e.g. "eastus").
            speech_key (str): Azure Speech subscription key.
            language (Optional[str]): Recognition language code.
        """
        # Imported lazily so the tool spec can be constructed/listed without
        # the azure SDK installed until a speech method is actually used.
        import azure.cognitiveservices.speech as speechsdk
        self.config = speechsdk.SpeechConfig(subscription=speech_key, region=region)
        self.config.speech_recognition_language = language

    def text_to_speech(self, text: str) -> None:
        """
        This tool accepts a natural language string and will use Azure speech services to create an
        audio version of the text, and play it on the users computer.

        args:
            text (str): The text to play
        """
        import azure.cognitiveservices.speech as speechsdk
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.config)
        # speak_text blocks until playback on the default device finishes.
        result = speech_synthesizer.speak_text(text)

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            return 'Audio playback complete.'
        elif result.reason == speechsdk.ResultReason.Canceled:
            # Surface the cancellation details; returns None in this branch.
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))

    def _transcribe(self, speech_recognizer) -> List[str]:
        """Run continuous recognition to completion and collect the text.

        Args:
            speech_recognizer: A configured azure SpeechRecognizer.

        Returns:
            List[str]: One recognized text segment per recognition event.
        """
        done = False
        results = []

        def stop_cb(evt) -> None:
            """Callback that stops continuous recognition."""
            speech_recognizer.stop_continuous_recognition_async()
            nonlocal done
            done = True

        # Collect each recognized phrase; stop on session end or cancellation.
        speech_recognizer.recognized.connect(lambda evt, results=results: results.append(evt.result.text))
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        # Start continuous speech recognition and poll until a stop callback
        # fires (the SDK delivers events on its own background thread).
        speech_recognizer.start_continuous_recognition_async()
        while not done:
            time.sleep(0.5)

        return results

    def speech_to_text(self, filename: str) -> List[str]:
        """
        This tool accepts a filename for a speech audio file and uses Azure to transcribe it into text

        args:
            filename (str): The name of the file to transcribe
        """
        import azure.cognitiveservices.speech as speechsdk
        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.config,
            audio_config=speechsdk.audio.AudioConfig(filename=filename)
        )
        return self._transcribe(speech_recognizer)
34 changes: 34 additions & 0 deletions llama_hub/tools/azure_translate/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Azure Translate Tool

This tool connects to an Azure account and allows an Agent to perform text translation into a variety of different languages

You will need to set up an api key and translate instance using Azure, learn more here: https://learn.microsoft.com/en-us/azure/ai-services/translator/translator-overview

For a full list of supported languages see here: https://learn.microsoft.com/en-us/azure/ai-services/translator/language-support

This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_speech.ipynb)

## Usage

Here's an example usage of the AzureTranslateToolSpec.

```python
from llama_index.agent import OpenAIAgent
from llama_hub.tools.azure_translate.base import AzureTranslateToolSpec

translate_tool = AzureTranslateToolSpec(
api_key='your-key',
region='eastus'
)

agent = OpenAIAgent.from_tools(
translate_tool.to_tool_list(),
verbose=True,
)
print(agent.chat('Say "hello world" in 5 different languages'))
```

`translate`: Translate text to a target language

This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.

1 change: 1 addition & 0 deletions llama_hub/tools/azure_translate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
## init
41 changes: 41 additions & 0 deletions llama_hub/tools/azure_translate/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Azure Translate tool spec."""

from llama_index.tools.tool_spec.base import BaseToolSpec
from typing import Optional, List
import requests

ENDPOINT_BASE_URL = "https://api.cognitive.microsofttranslator.com/translate"

class AzureTranslateToolSpec(BaseToolSpec):
    """Azure Translate tool spec.

    Wraps the Azure Translator text REST API so an agent can translate text
    into a target language; the source language is auto-detected by Azure.
    """

    spec_functions = ["translate"]

    def __init__(self, api_key: str, region: str) -> None:
        """Initialize with parameters.

        Args:
            api_key (str): Azure Translator subscription key.
            region (str): Azure region of the Translator resource.
        """
        self.headers = {
            'Ocp-Apim-Subscription-Key': api_key,
            'Ocp-Apim-Subscription-Region': region,
            'Content-type': 'application/json',
        }

    def translate(self, text: str, language: str):
        """
        Use this tool to translate text from one language to another.
        The source language will be automatically detected. You need to specify the target language
        using a two character language code.

        args:
            text (str): The text to translate.
            language (str): Target translation language. One of af, sq, am, ar, hy, as, az, bn, ba, eu, bs, bg, ca, hr, cs, da, dv, nl, en, et, fo, fj, fi, fr, gl, ka, de, el, gu, ht, he, hi, hu, is, id, iu, ga, it, ja, kn, kk, km, ko, ku, ky, lo, lv, lt, mk, mg, ms, ml, mt, mi, mr, my, ne, nb, or, ps, fa, pl, pt, pa, ro, ru, sm, sk, sl, so, es, sw, sv, ty, ta, tt, te, th, bo, ti, to, tr, tk, uk, ur, ug, uz, vi, cy, zu
        """
        # The API accepts a batch of documents; we send a single-element list.
        response = requests.post(
            ENDPOINT_BASE_URL,
            params={
                'api-version': '3.0',
                'to': language,
            },
            headers=self.headers,
            json=[{'text': text}],
        )
        return response.json()
13 changes: 13 additions & 0 deletions llama_hub/tools/library.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@
{
"AzureCVToolSpec": {
"id": "tools/azure_cv",
"author": "ajhofmann",
"keywords": ["image", "vision", "cv"]
},
"AzureSpeechToolSpec": {
"id": "tools/azure_speech",
"author": "ajhofmann"
},
"AzureTranslateToolSpec": {
"id": "tools/azure_translate",
"author": "ajhofmann"
},
"BingSearchToolSpec": {
"id": "tools/bing_search",
"author": "ajhofmann"
Expand Down
Loading