Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Adding Azure speech tools
Browse files Browse the repository at this point in the history
  • Loading branch information
ajhofmann committed Aug 11, 2023
1 parent fde170a commit 9992155
Show file tree
Hide file tree
Showing 6 changed files with 230 additions and 0 deletions.
29 changes: 29 additions & 0 deletions llama_hub/tools/azure_speech/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Azure Speech Tool

This tool allows Agents to use Microsoft Azure speech services to transcribe audio files to text, and create audio files from text. To see more and get started, visit https://azure.microsoft.com/en-us/products/ai-services/ai-speech

## Usage

This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_speech.ipynb)

```python
from llama_hub.tools.azure_speech.base import AzureSpeechToolSpec
from llama_index.agent import OpenAIAgent

speech_tool = AzureSpeechToolSpec(
speech_key='your-key',
region='eastus'
)

agent = OpenAIAgent.from_tools(
speech_tool.to_tool_list(),
verbose=True,
)
print(agent.chat('Say "hello world"'))
print(agent.chat('summarize the data/speech.wav audio file into a few sentences'))
```

- `text_to_speech`: Takes an input string and synthesizes audio to play on the user's computer
- `speech_to_text`: Takes a .wav file and transcribes it into text

This tool is designed to be used as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
1 change: 1 addition & 0 deletions llama_hub/tools/azure_speech/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
## init file
77 changes: 77 additions & 0 deletions llama_hub/tools/azure_speech/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Azure Speech tool spec."""

from llama_index.tools.tool_spec.base import BaseToolSpec
from typing import Optional, List
import time

class AzureSpeechToolSpec(BaseToolSpec):
    """Azure Speech tool spec.

    Exposes two agent tools backed by the Azure Speech SDK:
    ``speech_to_text`` (transcribe an audio file) and ``text_to_speech``
    (synthesize speech from a string).
    """

    spec_functions = ["speech_to_text", "text_to_speech"]

    def __init__(
        self,
        region: str,
        speech_key: str,
        language: Optional[str] = "en-US",
    ) -> None:
        """Initialize with parameters.

        Args:
            region (str): Azure region of the speech resource, e.g. 'eastus'.
            speech_key (str): Subscription key for the speech resource.
            language (Optional[str]): Speech recognition language. When None,
                the SDK configuration is left at its own default.
        """
        # Imported lazily so the SDK is only required when the tool is used.
        import azure.cognitiveservices.speech as speechsdk

        self.config = speechsdk.SpeechConfig(subscription=speech_key, region=region)
        # Only override the recognition language when one was actually given;
        # assigning None to the SDK property is not a valid language value.
        if language is not None:
            self.config.speech_recognition_language = language

    def text_to_speech(self, text: str) -> str:
        """
        This tool accepts a natural language string and will use Azure speech services to create an
        audio version of the text, and play it on the user's computer.
        args:
            text (str): The text to play
        """
        import azure.cognitiveservices.speech as speechsdk

        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.config)
        result = speech_synthesizer.speak_text(text)

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            return "Audio playback complete."

        if result.reason == speechsdk.ResultReason.Canceled:
            # Report the failure back to the caller (the agent) instead of
            # only printing it and silently returning None.
            cancellation_details = result.cancellation_details
            message = "Speech synthesis canceled: {}".format(
                cancellation_details.reason
            )
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                message += ". Error details: {}".format(
                    cancellation_details.error_details
                )
            return message

        # Any other outcome is unexpected here; surface it rather than hide it.
        return "Speech synthesis did not complete: {}".format(result.reason)

    def _transcribe(self, speech_recognizer) -> List[str]:
        """Run continuous recognition until the session stops or is canceled.

        Args:
            speech_recognizer: A recognizer exposing the ``recognized``,
                ``session_stopped`` and ``canceled`` event signals and the
                ``start/stop_continuous_recognition_async`` methods.

        Returns:
            List[str]: The text of each ``recognized`` event, in arrival order.
        """
        done = False
        results: List[str] = []

        def recognized_cb(evt) -> None:
            """Collect the text of each recognized utterance."""
            results.append(evt.result.text)

        def stop_cb(evt) -> None:
            """Stop continuous recognition and release the waiting loop."""
            nonlocal done
            speech_recognizer.stop_continuous_recognition_async()
            done = True

        speech_recognizer.recognized.connect(recognized_cb)
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        # Start continuous speech recognition, then poll until a stop or
        # cancel callback flips the flag.
        speech_recognizer.start_continuous_recognition_async()
        while not done:
            time.sleep(0.5)

        return results

    def speech_to_text(self, filename: str) -> List[str]:
        """
        This tool accepts a filename for a speech audio file and uses Azure to transcribe it into text
        args:
            filename (str): The name of the file to transcribe
        """
        import azure.cognitiveservices.speech as speechsdk

        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.config,
            audio_config=speechsdk.audio.AudioConfig(filename=filename),
        )
        return self._transcribe(speech_recognizer)
8 changes: 8 additions & 0 deletions llama_hub/tools/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@
"author": "ajhofmann",
"keywords": ["image", "vision", "cv"]
},
"AzureSpeechToolSpec": {
"id": "tools/azure_speech",
"author": "ajhofmann"
},
"AzureTranslateToolSpec": {
"id": "tools/azure_translate",
"author": "ajhofmann"
},
"BingSearchToolSpec": {
"id": "tools/bing_search",
"author": "ajhofmann"
Expand Down
115 changes: 115 additions & 0 deletions llama_hub/tools/notebooks/azure_speech.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "4cdf841d-bc87-4138-96d6-f8b6b173ddd8",
"metadata": {},
"outputs": [],
"source": [
"# Setup OpenAI Agent\n",
"import openai\n",
"openai.api_key = 'sk-your-key'\n",
"from llama_index.agent import OpenAIAgent\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6358bd48-627c-47d6-9761-376877195950",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Calling Function ===\n",
"Calling function: text_to_speech with args: {\n",
" \"text\": \"hello world\"\n",
"}\n",
"Got output: Audio playback complete.\n",
"========================\n",
"\n"
]
}
],
"source": [
"from llama_hub.tools.azure_speech.base import AzureSpeechToolSpec\n",
"\n",
"speech_tool = AzureSpeechToolSpec(\n",
" speech_key='your-key',\n",
" region='eastus'\n",
")\n",
"\n",
"agent = OpenAIAgent.from_tools(\n",
" speech_tool.to_tool_list(),\n",
" verbose=True,\n",
")\n",
"print(agent.chat('Say \"hello world\"'))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "71c0244c-d9f3-4849-8571-50cee19f705b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Calling Function ===\n",
"Calling function: speech_to_text with args: {\n",
" \"filename\": \"data/speech.wav\"\n",
"}\n",
"Got output: ['Hello, thank you for calling Contoso. Who am I speaking with today?', \"Hi, my name is Mary Rondo. I'm trying to enroll myself with conscious So.\", 'Hi, Mary. Uh, are you calling because you need health insurance?', \"Yes, yeah, I'm calling to sign up for insurance.\", 'Great. Uh, if you can answer a few questions, uh, we can get you signed up in the jiffy.', 'OK.', \"So uh, what's your full name?\", 'Uh, so Mary Beth Rondo, last name is R like Romeo, O like Ocean, and like Nancy. DD like Dog and O like Ocean again.', \"Rando got it. And what's the best callback number in case we get disconnected?\", 'I only have a cell phone, so I can give you that.', \"Yeah, that'll be fine.\", \"Sure. So it's 234554 and then 9312.\", \"To confirm, it's 234-554-9312.\", \"Yep, that's right.\", \"Excellent. Uh, Let's get some additional information from your app. For your application, Uh, do you have a job?\", 'Uh, yes, I am self-employed.', 'OK, So then you have a Social Security number as well? Uh, yes, I do.', 'OK, uh, and what is your Social Security number please?', \"Uh, sure. So it's 412.\", 'Uh 256789.', 'Sorry, what was that, A-25 or A225 you cut out for a bit?', \"Uh, it's 22 so.\", '412, then another two, then five.', 'Hey, thank you so much and could I have your e-mail address please?', \"Yeah, it's Mary [email protected], so [email protected]. No periods, no dashes.\", \"Great. Uh, that is the last question. So let me take your information and I'll be able to get you signed up right away. Thank you for calling Contoso and I'll be able to get you signed up immediately. One of our agents will call you back in about 24 hours or so to confirm your application.\", 'That sounds great. Thank you.', 'Absolutely. If you need anything else, please give us a call at 1-800-555-5564 ext 123. 
Thank you very much for calling Contoso.', 'Uh, actually uh, sorry, one more question.', 'Ohh yes, of course.', \"I'm curious what I'd be getting a physical card as proof of coverage.\", 'So the default is a digital membership card, but we can send you a physical card if you prefer.', \"Uh, yes. Could you please mail it to me when it's ready? I'd like to have it shipped to you from my address.\", 'Ohh yeah.', \"Uh so it's 2660 Unit A on Maple Ave. SE Lansing and then zip code is 48823.\", \"Absolutely. I've made a note on your file.\", 'Awesome. Thanks so much.', \"You're very welcome. Thank you for calling Contoso and have a great day.\"]\n",
"========================\n",
"The audio file contains a conversation between a customer named Mary Rondo and a representative from Contoso. Mary is calling to enroll herself in health insurance. The representative asks for Mary's full name, callback number, and email address. They also ask about Mary's employment status and Social Security number. The conversation ends with the representative confirming that they will get Mary signed up and that an agent will call her back to confirm the application. Mary asks about receiving a physical card as proof of coverage and provides her mailing address. The representative notes the request and thanks Mary for calling Contoso.\n"
]
}
],
"source": [
"print(agent.chat('summarize the data/speech.wav audio file into a few sentences'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb814f3b-abd6-4910-8c53-5c6001fda04c",
"metadata": {},
"outputs": [],
"source": [
"print(agent.chat('rate the customer service agents performance based on the conversation'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9030a767-8708-4642-9232-1f62ffe97b50",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file added llama_hub/tools/notebooks/data/speech.wav
Binary file not shown.

0 comments on commit 9992155

Please sign in to comment.