Adding Azure speech tools

run-llama · Aug 11, 2023 · 86076da · 86076da
1 parent 091b090
commit 86076da
Show file tree

Hide file tree

Showing 6 changed files with 230 additions and 0 deletions.
diff --git a/llama_hub/tools/azure_speech/README.md b/llama_hub/tools/azure_speech/README.md
@@ -0,0 +1,29 @@
+# Azure Speech Tool
+
+This tool allows Agents to use Microsoft Azure speech services to transcribe audio files to text, and create audio files from text. To see more and get started, visit https://azure.microsoft.com/en-us/products/ai-services/ai-speech
+
+## Usage
+
+This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_speech.ipynb)
+
+```python
+from llama_hub.tools.azure_speech.base import AzureSpeechToolSpec
+from llama_index.agent import OpenAIAgent
+
+speech_tool = AzureSpeechToolSpec(
+    speech_key='your-key',
+    region='eastus'
+)
+
+agent = OpenAIAgent.from_tools(
+    speech_tool.to_tool_list(),
+    verbose=True,
+)
+print(agent.chat('Say "hello world"'))
+print(agent.chat('summarize the data/speech.wav audio file into a few sentences'))
+```
+
+- `text_to_speech`: Takes an input string and synthesizes audio to play on the user's computer
+- `speech_to_text`: Takes a .wav file and transcribes it into text
+
+This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
diff --git a/llama_hub/tools/azure_speech/__init__.py b/llama_hub/tools/azure_speech/__init__.py
@@ -0,0 +1 @@
+## init file
diff --git a/llama_hub/tools/azure_speech/base.py b/llama_hub/tools/azure_speech/base.py
@@ -0,0 +1,77 @@
+"""Azure Speech tool spec."""
+
+from llama_index.tools.tool_spec.base import BaseToolSpec
+from typing import Optional, List
+import time
+
+class AzureSpeechToolSpec(BaseToolSpec):
+    """Azure Speech tool spec."""
+
+    spec_functions = ["speech_to_text", "text_to_speech"]
+
+    def __init__(
+        self,
+        region: str,
+        speech_key: str,
+        language: Optional[str] = "en-US"
+    ) -> None:
+        import azure.cognitiveservices.speech as speechsdk
+        """Initialize with parameters."""
+        self.config = speechsdk.SpeechConfig(subscription=speech_key, region=region)
+        self.config.speech_recognition_language = language
+
+    def text_to_speech(self, text: str) -> None:
+        """
+        This tool accepts a natural language string and will use Azure speech services to create an
+        audio version of the text, and play it on the users computer.
+
+        args:
+            text (str): The text to play
+        """
+        import azure.cognitiveservices.speech as speechsdk
+        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.config)
+        result = speech_synthesizer.speak_text(text)
+
+        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+            stream = speechsdk.AudioDataStream(result)
+            return 'Audio playback complete.'
+        elif result.reason == speechsdk.ResultReason.Canceled:
+            cancellation_details = result.cancellation_details
+            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
+            if cancellation_details.reason == speechsdk.CancellationReason.Error:
+                print("Error details: {}".format(cancellation_details.error_details))
+
+    def _transcribe(self, speech_recognizer) -> List[str]:
+        done = False
+        results = []
+
+        def stop_cb(evt) -> None:
+            """callback that stop continuous recognition"""
+            speech_recognizer.stop_continuous_recognition_async()
+            nonlocal done
+            done = True
+
+        speech_recognizer.recognized.connect(lambda evt, results=results: results.append(evt.result.text))
+        speech_recognizer.session_stopped.connect(stop_cb)
+        speech_recognizer.canceled.connect(stop_cb)
+
+        # Start continuous speech recognition
+        speech_recognizer.start_continuous_recognition_async()
+        while not done:
+            time.sleep(0.5)
+
+        return results
+
+    def speech_to_text(self, filename: str) -> List[str]:
+        """
+        This tool accepts a filename for a speech audio file and uses Azure to transcribe it into text
+
+        args:
+            filename (str): The name of the file to transcribe
+        """
+        import azure.cognitiveservices.speech as speechsdk
+        speech_recognizer = speechsdk.SpeechRecognizer(
+            speech_config=self.config,
+            audio_config=speechsdk.audio.AudioConfig(filename=filename)
+        )
+        return self._transcribe(speech_recognizer)
diff --git a/llama_hub/tools/library.json b/llama_hub/tools/library.json
@@ -1,4 +1,12 @@
 {
+  "AzureSpeechToolSpec": {
+    "id": "tools/azure_speech",
+    "author": "ajhofmann"
+  },
+  "AzureTranslateToolSpec": {
+    "id": "tools/azure_translate",
+    "author": "ajhofmann" 
+  },
   "BingSearchToolSpec": {
     "id": "tools/bing_search",
     "author": "ajhofmann"

diff --git a/llama_hub/tools/notebooks/azure_speech.ipynb b/llama_hub/tools/notebooks/azure_speech.ipynb
@@ -0,0 +1,115 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "4cdf841d-bc87-4138-96d6-f8b6b173ddd8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Setup OpenAI Agent\n",
+    "import openai\n",
+    "openai.api_key = 'sk-your-key'\n",
+    "from llama_index.agent import OpenAIAgent\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "6358bd48-627c-47d6-9761-376877195950",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== Calling Function ===\n",
+      "Calling function: text_to_speech with args: {\n",
+      "  \"text\": \"hello world\"\n",
+      "}\n",
+      "Got output: Audio playback complete.\n",
+      "========================\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from llama_hub.tools.azure_speech.base import AzureSpeechToolSpec\n",
+    "\n",
+    "speech_tool = AzureSpeechToolSpec(\n",
+    "    speech_key='your-key',\n",
+    "    region='eastus'\n",
+    ")\n",
+    "\n",
+    "agent = OpenAIAgent.from_tools(\n",
+    "    speech_tool.to_tool_list(),\n",
+    "    verbose=True,\n",
+    ")\n",
+    "print(agent.chat('Say \"hello world\"'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "71c0244c-d9f3-4849-8571-50cee19f705b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== Calling Function ===\n",
+      "Calling function: speech_to_text with args: {\n",
+      "  \"filename\": \"data/speech.wav\"\n",
+      "}\n",
+      "Got output: ['Hello, thank you for calling Contoso. Who am I speaking with today?', \"Hi, my name is Mary Rondo. I'm trying to enroll myself with conscious So.\", 'Hi, Mary. Uh, are you calling because you need health insurance?', \"Yes, yeah, I'm calling to sign up for insurance.\", 'Great. Uh, if you can answer a few questions, uh, we can get you signed up in the jiffy.', 'OK.', \"So uh, what's your full name?\", 'Uh, so Mary Beth Rondo, last name is R like Romeo, O like Ocean, and like Nancy. DD like Dog and O like Ocean again.', \"Rando got it. And what's the best callback number in case we get disconnected?\", 'I only have a cell phone, so I can give you that.', \"Yeah, that'll be fine.\", \"Sure. So it's 234554 and then 9312.\", \"To confirm, it's 234-554-9312.\", \"Yep, that's right.\", \"Excellent. Uh, Let's get some additional information from your app. For your application, Uh, do you have a job?\", 'Uh, yes, I am self-employed.', 'OK, So then you have a Social Security number as well? Uh, yes, I do.', 'OK, uh, and what is your Social Security number please?', \"Uh, sure. So it's 412.\", 'Uh 256789.', 'Sorry, what was that, A-25 or A225 you cut out for a bit?', \"Uh, it's 22 so.\", '412, then another two, then five.', 'Hey, thank you so much and could I have your e-mail address please?', \"Yeah, it's Mary [email protected], so [email protected]. No periods, no dashes.\", \"Great. Uh, that is the last question. So let me take your information and I'll be able to get you signed up right away. Thank you for calling Contoso and I'll be able to get you signed up immediately. One of our agents will call you back in about 24 hours or so to confirm your application.\", 'That sounds great. Thank you.', 'Absolutely. If you need anything else, please give us a call at 1-800-555-5564 ext 123. 
Thank you very much for calling Contoso.', 'Uh, actually uh, sorry, one more question.', 'Ohh yes, of course.', \"I'm curious what I'd be getting a physical card as proof of coverage.\", 'So the default is a digital membership card, but we can send you a physical card if you prefer.', \"Uh, yes. Could you please mail it to me when it's ready? I'd like to have it shipped to you from my address.\", 'Ohh yeah.', \"Uh so it's 2660 Unit A on Maple Ave. SE Lansing and then zip code is 48823.\", \"Absolutely. I've made a note on your file.\", 'Awesome. Thanks so much.', \"You're very welcome. Thank you for calling Contoso and have a great day.\"]\n",
+      "========================\n",
+      "The audio file contains a conversation between a customer named Mary Rondo and a representative from Contoso. Mary is calling to enroll herself in health insurance. The representative asks for Mary's full name, callback number, and email address. They also ask about Mary's employment status and Social Security number. The conversation ends with the representative confirming that they will get Mary signed up and that an agent will call her back to confirm the application. Mary asks about receiving a physical card as proof of coverage and provides her mailing address. The representative notes the request and thanks Mary for calling Contoso.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(agent.chat('summarize the data/speech.wav audio file into a few sentences'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eb814f3b-abd6-4910-8c53-5c6001fda04c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(agent.chat('rate the customer service agents performance based on the conversation'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9030a767-8708-4642-9232-1f62ffe97b50",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/llama_hub/tools/notebooks/data/speech.wav b/llama_hub/tools/notebooks/data/speech.wav