This repository has been archived by the owner on Mar 1, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 736
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
206 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Azure Computer Vision Tool | ||
|
||
This tool connects to an Azure account and allows an Agent to perform a variety of computer vision tasks on image URLs. | ||
|
||
You will need to set up an API key and a Computer Vision instance using Azure. Learn more here: https://azure.microsoft.com/en-ca/products/cognitive-services/computer-vision | ||
|
||
## Usage | ||
|
||
This tool has more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_vision.ipynb) | ||
|
||
Here's an example usage of the AzureCVToolSpec. | ||
|
||
```python | ||
from llama_hub.tools.azure_cv.base import AzureCVToolSpec | ||
from llama_index.agent import OpenAIAgent | ||
|
||
tool_spec = AzureCVToolSpec(api_key='your-key', resource='your-resource') | ||
|
||
agent = OpenAIAgent.from_tools(tool_spec.to_tool_list()) | ||
|
||
agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png') | ||
agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg') | ||
``` | ||
|
||
`process_image`: Send an image for computer vision classification of objects, tags, captioning or OCR. | ||
|
||
This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
## init |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
"""Azure Cognitive Vision tool spec.""" | ||
|
||
from llama_index.tools.tool_spec.base import BaseToolSpec | ||
from typing import Optional, List | ||
import requests | ||
import urllib.parse | ||
|
||
CV_URL_TMPL = "https://{resource}.cognitiveservices.azure.com/computervision/imageanalysis:analyze" | ||
|
||
class AzureCVToolSpec(BaseToolSpec):
    """Azure Cognitive Vision tool spec.

    Wraps the Azure Computer Vision Image Analysis REST endpoint so an
    agent can caption, tag, OCR ("read"), or detect objects in an image
    given its URL.
    """

    spec_functions = ["process_image"]

    def __init__(
        self,
        resource: str,
        api_key: str,
        language: Optional[str] = 'en',
        api_version: Optional[str] = '2023-04-01-preview'
    ) -> None:
        """Initialize with Azure resource details.

        Args:
            resource (str): Name of the Azure Cognitive Services resource;
                interpolated into the endpoint hostname.
            api_key (str): Value sent as the Ocp-Apim-Subscription-Key header.
            language (Optional[str]): Language for returned text. Defaults to 'en'.
            api_version (Optional[str]): REST API version query parameter.
        """
        self.api_key = api_key
        self.cv_url = CV_URL_TMPL.format(resource=resource)
        self.language = language
        self.api_version = api_version

    def process_image(self, url: str, features: List[str]):
        """
        Send an image URL to Azure Computer Vision and return the analysis.

        You can use the features argument to configure what text you want returned.

        Args:
            url (str): The url for the image to process.
            features (List[str]): Instructions on how to process the image.
                Valid keys are tags, objects, read, caption.

        Returns:
            dict: Parsed JSON response from Azure. When 'read' (OCR) is
            requested, the 'readResult' entry is flattened to just the
            recognized text content.
        """
        response = requests.post(
            f'{self.cv_url}?features={",".join(features)}&language={self.language}&api-version={self.api_version}',
            headers={'Ocp-Apim-Subscription-Key': self.api_key},
            json={'url': url},
            # Without a timeout, requests can block indefinitely on an
            # unresponsive endpoint.
            timeout=60,
        )
        response_json = response.json()
        # Flatten the OCR payload to the recognized text. Guard on key
        # presence: Azure error responses carry no 'readResult', and the
        # original unguarded access raised KeyError instead of returning
        # the error body to the agent.
        if 'read' in features and 'readResult' in response_json:
            response_json['readResult'] = response_json['readResult']['content']

        return response_json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "34e66a0e-e41d-48e0-8a1f-b82b5ea18ad1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Setup OpenAI Agent\n", | ||
"import openai\n", | ||
"openai.api_key = 'sk-your-key'\n", | ||
"from llama_index.agent import OpenAIAgent\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "eb11c1a6-1540-4538-8d1a-bb8b265fdb64", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"=== Calling Function ===\n", | ||
"Calling function: process_image with args: {\n", | ||
" \"url\": \"https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png\",\n", | ||
" \"features\": [\"caption\", \"tags\"]\n", | ||
"}\n", | ||
"Got output: {'captionResult': {'text': 'a group of cows grazing in a field', 'confidence': 0.861102819442749}, 'modelVersion': '2023-02-01-preview', 'metadata': {'width': 375, 'height': 250}, 'tagsResult': {'values': [{'name': 'grass', 'confidence': 0.9988070130348206}, {'name': 'outdoor', 'confidence': 0.9931809306144714}, {'name': 'field', 'confidence': 0.9857797622680664}, {'name': 'animal', 'confidence': 0.9708199501037598}, {'name': 'livestock', 'confidence': 0.965355634689331}, {'name': 'cow', 'confidence': 0.954204797744751}, {'name': 'herd', 'confidence': 0.9496941566467285}, {'name': 'ranch', 'confidence': 0.9301772117614746}, {'name': 'mammal', 'confidence': 0.9299676418304443}, {'name': 'dairy cow', 'confidence': 0.9291023015975952}, {'name': 'bovine', 'confidence': 0.9199285507202148}, {'name': 'herding', 'confidence': 0.8967740535736084}, {'name': 'fodder', 'confidence': 0.8817697763442993}, {'name': 'grassland', 'confidence': 0.8811800479888916}, {'name': 'standing', 'confidence': 0.8034635782241821}, {'name': 'pasture', 'confidence': 0.6391813158988953}, {'name': 'grazing', 'confidence': 0.6333702802658081}, {'name': 'farm', 'confidence': 0.6285721063613892}, {'name': 'cattle', 'confidence': 0.5256974697113037}, {'name': 'landscape', 'confidence': 0.4293440878391266}]}}\n", | ||
"========================\n", | ||
"The caption for the image is \"a group of cows grazing in a field\". \n", | ||
"\n", | ||
"The tags in the image include: grass, outdoor, field, animal, livestock, cow, herd, ranch, mammal, dairy cow, bovine, herding, fodder, grassland, standing, pasture, grazing, farm, cattle, and landscape.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from llama_hub.tools.azure_cv.base import AzureCVToolSpec\n", | ||
"\n", | ||
"cv_tool = AzureCVToolSpec(\n", | ||
" api_key='your-key',\n", | ||
" resource='your-resource'\n", | ||
")\n", | ||
"\n", | ||
"agent = OpenAIAgent.from_tools(\n", | ||
" cv_tool.to_tool_list(),\n", | ||
" verbose=True,\n", | ||
")\n", | ||
"\n", | ||
"print(agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png'))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "d6be81e1-41a6-48b6-920b-b225c0f16a9b", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"=== Calling Function ===\n", | ||
"Calling function: process_image with args: {\n", | ||
" \"url\": \"https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg\",\n", | ||
" \"features\": [\"caption\", \"read\"]\n", | ||
"}\n", | ||
"Got output: {'captionResult': {'text': 'close-up of a nutrition label', 'confidence': 0.822258710861206}, 'readResult': 'Nutrition Facts Amount Per Serving\\nServing size: 1 bar (40g)\\nServing Per Package: 4\\nTotal Fat 13g\\nSaturated Fat 1.5g\\nAmount Per Serving\\nTrans Fat 0g\\ncalories 190\\nCholesterol 0mg\\nories from Fat 110\\nSodium 20mg\\nnt Daily Values are based on\\nVitamin A 50\\ncalorie diet', 'modelVersion': '2023-02-01-preview', 'metadata': {'width': 1254, 'height': 704}}\n", | ||
"========================\n", | ||
"The caption for the image is \"close-up of a nutrition label\".\n", | ||
"\n", | ||
"The text from the image is as follows:\n", | ||
"\n", | ||
"\"Nutrition Facts Amount Per Serving\n", | ||
"Serving size: 1 bar (40g)\n", | ||
"Serving Per Package: 4\n", | ||
"Total Fat 13g\n", | ||
"Saturated Fat 1.5g\n", | ||
"Amount Per Serving\n", | ||
"Trans Fat 0g\n", | ||
"calories 190\n", | ||
"Cholesterol 0mg\n", | ||
"ories from Fat 110\n", | ||
"Sodium 20mg\n", | ||
"nt Daily Values are based on\n", | ||
"Vitamin A 50\n", | ||
"calorie diet\"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg'))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "708cc1e0-199b-48a6-a88b-17af19b3f518", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |