diff --git a/llama_hub/tools/azure_cv/README.md b/llama_hub/tools/azure_cv/README.md
new file mode 100644
index 0000000000..29882f53e9
--- /dev/null
+++ b/llama_hub/tools/azure_cv/README.md
@@ -0,0 +1,27 @@
+# Azure Computer Vision Tool
+
+This tool connects to an Azure account and allows an Agent to perform a variety of computer vision tasks on image URLs.
+
+You will need to set up an API key and a Computer Vision instance using Azure; learn more here: https://azure.microsoft.com/en-ca/products/cognitive-services/computer-vision
+
+## Usage
+
+This tool has more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_vision.ipynb)
+
+Here's an example usage of the AzureCVToolSpec.
+
+```python
+from llama_hub.tools.azure_cv.base import AzureCVToolSpec
+from llama_index.agent import OpenAIAgent
+
+tool_spec = AzureCVToolSpec(api_key='your-key', resource='your-resource')
+
+agent = OpenAIAgent.from_tools(tool_spec.to_tool_list())
+
+agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png')
+agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg')
+```
+
+`process_image`: Send an image for computer vision classification of objects, tags, captioning, or OCR.
+
+This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
diff --git a/llama_hub/tools/azure_cv/__init__.py b/llama_hub/tools/azure_cv/__init__.py
new file mode 100644
index 0000000000..99bc496405
--- /dev/null
+++ b/llama_hub/tools/azure_cv/__init__.py
@@ -0,0 +1 @@
+## init
\ No newline at end of file
diff --git a/llama_hub/tools/azure_cv/base.py b/llama_hub/tools/azure_cv/base.py
new file mode 100644
index 0000000000..e16d3a0482
--- /dev/null
+++ b/llama_hub/tools/azure_cv/base.py
@@ -0,0 +1,46 @@
+"""Azure Cognitive Vision tool spec."""
+
+from llama_index.tools.tool_spec.base import BaseToolSpec
+from typing import Optional, List
+import requests
+
+CV_URL_TMPL = "https://{resource}.cognitiveservices.azure.com/computervision/imageanalysis:analyze"
+
+class AzureCVToolSpec(BaseToolSpec):
+    """Azure Cognitive Vision tool spec."""
+
+    spec_functions = ["process_image"]
+
+    def __init__(
+        self,
+        resource: str,
+        api_key: str,
+        language: Optional[str] = 'en',
+        api_version: Optional[str] = '2023-04-01-preview'
+    ) -> None:
+        """Initialize with parameters."""
+        self.api_key = api_key
+        self.cv_url = CV_URL_TMPL.format(resource=resource)
+        self.language = language
+        self.api_version = api_version
+
+    def process_image(self, url: str, features: List[str]):
+        """
+        This tool accepts an image url and can process and return a variety of text depending on the use case.
+        You can use the features argument to configure what text you want returned.
+
+        Args:
+            url (str): The url of the image to process
+            features (List[str]): Instructions on how to process the image.
+                Valid keys are tags, objects, read, caption
+        """
+        response = requests.post(
+            f'{self.cv_url}?features={",".join(features)}&language={self.language}&api-version={self.api_version}',
+            headers={'Ocp-Apim-Subscription-Key': self.api_key},
+            json={'url': url}
+        )
+        response_json = response.json()
+        # Flatten the OCR result down to just the recognized text
+        if 'read' in features:
+            response_json['readResult'] = response_json['readResult']['content']
+
+        return response_json
\ No newline at end of file
diff --git a/llama_hub/tools/library.json b/llama_hub/tools/library.json
index c1d7f4d90c..70e8759239 100644
--- a/llama_hub/tools/library.json
+++ b/llama_hub/tools/library.json
@@ -1,4 +1,9 @@
 {
+  "AzureCVToolSpec": {
+    "id": "tools/azure_cv",
+    "author": "ajhofmann",
+    "keywords": ["image", "vision", "cv"]
+  },
   "ChatGPTPluginToolSpec": {
     "id": "tools/chatgpt_plugin",
     "author": "ajhofmann"
diff --git a/llama_hub/tools/notebooks/azure_vision.ipynb b/llama_hub/tools/notebooks/azure_vision.ipynb
new file mode 100644
index 0000000000..de6b2ce019
--- /dev/null
+++ b/llama_hub/tools/notebooks/azure_vision.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "34e66a0e-e41d-48e0-8a1f-b82b5ea18ad1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Setup OpenAI Agent\n",
+    "import openai\n",
+    "openai.api_key = 'sk-your-key'\n",
+    "from llama_index.agent import OpenAIAgent\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "eb11c1a6-1540-4538-8d1a-bb8b265fdb64",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== Calling Function ===\n",
+      "Calling function: process_image with args: {\n",
+      "  \"url\": \"https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png\",\n",
+      "  \"features\": [\"caption\", \"tags\"]\n",
+      "}\n",
+      "Got output: {'captionResult': {'text': 'a group of cows grazing in a field', 'confidence': 0.861102819442749}, 'modelVersion': '2023-02-01-preview', 'metadata': {'width': 375, 'height': 250}, 'tagsResult': {'values': [{'name': 'grass', 'confidence': 0.9988070130348206}, {'name': 'outdoor', 'confidence': 0.9931809306144714}, {'name': 'field', 'confidence': 0.9857797622680664}, {'name': 'animal', 'confidence': 0.9708199501037598}, {'name': 'livestock', 'confidence': 0.965355634689331}, {'name': 'cow', 'confidence': 0.954204797744751}, {'name': 'herd', 'confidence': 0.9496941566467285}, {'name': 'ranch', 'confidence': 0.9301772117614746}, {'name': 'mammal', 'confidence': 0.9299676418304443}, {'name': 'dairy cow', 'confidence': 0.9291023015975952}, {'name': 'bovine', 'confidence': 0.9199285507202148}, {'name': 'herding', 'confidence': 0.8967740535736084}, {'name': 'fodder', 'confidence': 0.8817697763442993}, {'name': 'grassland', 'confidence': 0.8811800479888916}, {'name': 'standing', 'confidence': 0.8034635782241821}, {'name': 'pasture', 'confidence': 0.6391813158988953}, {'name': 'grazing', 'confidence': 0.6333702802658081}, {'name': 'farm', 'confidence': 0.6285721063613892}, {'name': 'cattle', 'confidence': 0.5256974697113037}, {'name': 'landscape', 'confidence': 0.4293440878391266}]}}\n",
+      "========================\n",
+      "The caption for the image is \"a group of cows grazing in a field\". \n",
\n", + "\n", + "The tags in the image include: grass, outdoor, field, animal, livestock, cow, herd, ranch, mammal, dairy cow, bovine, herding, fodder, grassland, standing, pasture, grazing, farm, cattle, and landscape.\n" + ] + } + ], + "source": [ + "from llama_hub.tools.azure_cv.base import AzureCVToolSpec\n", + "\n", + "cv_tool = AzureCVToolSpec(\n", + " api_key='your-key',\n", + " resource='your-resource'\n", + ")\n", + "\n", + "agent = OpenAIAgent.from_tools(\n", + " cv_tool.to_tool_list(),\n", + " verbose=True,\n", + ")\n", + "\n", + "print(agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d6be81e1-41a6-48b6-920b-b225c0f16a9b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Calling Function ===\n", + "Calling function: process_image with args: {\n", + " \"url\": \"https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg\",\n", + " \"features\": [\"caption\", \"read\"]\n", + "}\n", + "Got output: {'captionResult': {'text': 'close-up of a nutrition label', 'confidence': 0.822258710861206}, 'readResult': 'Nutrition Facts Amount Per Serving\\nServing size: 1 bar (40g)\\nServing Per Package: 4\\nTotal Fat 13g\\nSaturated Fat 1.5g\\nAmount Per Serving\\nTrans Fat 0g\\ncalories 190\\nCholesterol 0mg\\nories from Fat 110\\nSodium 20mg\\nnt Daily Values are based on\\nVitamin A 50\\ncalorie diet', 'modelVersion': '2023-02-01-preview', 'metadata': {'width': 1254, 'height': 704}}\n", + "========================\n", + "The caption for the image is \"close-up of a nutrition label\".\n", + "\n", + "The text from the image is as follows:\n", + "\n", + "\"Nutrition Facts Amount Per Serving\n", + "Serving size: 1 bar (40g)\n", + "Serving Per Package: 4\n", + "Total Fat 13g\n", + "Saturated Fat 1.5g\n", + "Amount Per Serving\n", + "Trans Fat 0g\n", + "calories 190\n", + "Cholesterol 0mg\n", + "ories from Fat 110\n", + "Sodium 20mg\n", + "nt Daily Values are based on\n", + "Vitamin A 50\n", + "calorie diet\"\n" + ] + } + ], + "source": [ + "print(agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "708cc1e0-199b-48a6-a88b-17af19b3f518", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}