Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Adding Azure computer vision tool
Browse files Browse the repository at this point in the history
  • Loading branch information
ajhofmann committed Aug 9, 2023
1 parent 58bee02 commit fd8b7c5
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 0 deletions.
27 changes: 27 additions & 0 deletions llama_hub/tools/azure_cv/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Azure Computer Vision Tool

This tool connects to an Azure account and allows an Agent to perform a variety of computer vision tasks on image URLs.

You will need to set up an API key and a Computer Vision instance in Azure. Learn more here: https://azure.microsoft.com/en-ca/products/cognitive-services/computer-vision

## Usage

This tool has more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/azure_vision.ipynb)

Here's an example usage of the AzureCVToolSpec.

```python
from llama_hub.tools.azure_cv.base import AzureCVToolSpec
from llama_index.agent import OpenAIAgent

tool_spec = AzureCVToolSpec(api_key='your-key', resource='your-resource')

agent = OpenAIAgent.from_tools(tool_spec.to_tool_list())

agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png')
agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg')
```

`process_image`: Send an image for computer vision classification of objects, tags, captioning or OCR.

This tool spec is designed to be used as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
1 change: 1 addition & 0 deletions llama_hub/tools/azure_cv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
## init
46 changes: 46 additions & 0 deletions llama_hub/tools/azure_cv/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Azure Cognitive Vision tool spec."""

from llama_index.tools.tool_spec.base import BaseToolSpec
from typing import Optional, List
import requests
import urllib.parse

# URL template for the Azure Computer Vision image-analysis endpoint.
# The ``{resource}`` placeholder is filled in with the Azure resource name
# when an AzureCVToolSpec is constructed.
CV_URL_TMPL = "https://{resource}.cognitiveservices.azure.com/computervision/imageanalysis:analyze"

class AzureCVToolSpec(BaseToolSpec):
    """Azure Cognitive Vision tool spec.

    Exposes ``process_image`` as an agent tool that sends an image URL to the
    Azure Computer Vision image-analysis endpoint and returns the parsed JSON
    response.
    """

    spec_functions = ["process_image"]

    def __init__(
        self,
        resource: str,
        api_key: str,
        language: Optional[str] = "en",
        api_version: Optional[str] = "2023-04-01-preview",
        timeout: Optional[float] = 30.0,
    ) -> None:
        """Initialize with parameters.

        Args:
            resource: Azure resource name; substituted into the endpoint URL.
            api_key: Subscription key sent in the
                ``Ocp-Apim-Subscription-Key`` header.
            language: Value for the ``language`` query parameter. ``None``
                omits the parameter.
            api_version: Value for the ``api-version`` query parameter.
                ``None`` omits the parameter.
            timeout: Seconds to wait for the HTTP request before giving up.
                ``None`` waits indefinitely (the previous behaviour).
        """
        self.api_key = api_key
        self.cv_url = CV_URL_TMPL.format(resource=resource)
        self.language = language
        self.api_version = api_version
        self.timeout = timeout

    def process_image(self, url: str, features: List[str]):
        """
        This tool accepts an image url and can process and return a variety of text depending on the use case.
        You can use the features argument to configure what text you want returned.

        args:
            url (str): The url for the image to process
            features (List[str]): Instructions on how to process the image. Valid keys are tags, objects, read, caption
        """
        query_params = {
            "features": ",".join(features),
            "language": self.language,
            "api-version": self.api_version,
        }
        # Drop unset options instead of sending the literal text "None", and
        # percent-encode the rest. Commas stay literal so the feature list
        # looks the same on the wire as before.
        query = urllib.parse.urlencode(
            {key: value for key, value in query_params.items() if value is not None},
            safe=",",
        )

        response = requests.post(
            f"{self.cv_url}?{query}",
            headers={"Ocp-Apim-Subscription-Key": self.api_key},
            json={"url": url},
            # Without a timeout a stalled connection would block the agent forever.
            timeout=self.timeout,
        )
        response_json = response.json()

        if "read" in features:
            # Flatten the OCR result to its text content. The key is absent
            # when the service returns an error payload; in that case return
            # the payload untouched rather than raising KeyError.
            read_result = response_json.get("readResult")
            if isinstance(read_result, dict) and "content" in read_result:
                response_json["readResult"] = read_result["content"]

        return response_json
5 changes: 5 additions & 0 deletions llama_hub/tools/library.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
{
"AzureCVToolSpec": {
"id": "tools/azure_cv",
"author": "ajhofmann",
"keywords": ["image", "vision", "cv"]
},
"ChatGPTPluginToolSpec": {
"id": "tools/chatgpt_plugin",
"author": "ajhofmann"
Expand Down
127 changes: 127 additions & 0 deletions llama_hub/tools/notebooks/azure_vision.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "34e66a0e-e41d-48e0-8a1f-b82b5ea18ad1",
"metadata": {},
"outputs": [],
"source": [
"# Setup OpenAI Agent\n",
"import openai\n",
"openai.api_key = 'sk-your-key'\n",
"from llama_index.agent import OpenAIAgent\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "eb11c1a6-1540-4538-8d1a-bb8b265fdb64",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Calling Function ===\n",
"Calling function: process_image with args: {\n",
" \"url\": \"https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png\",\n",
" \"features\": [\"caption\", \"tags\"]\n",
"}\n",
"Got output: {'captionResult': {'text': 'a group of cows grazing in a field', 'confidence': 0.861102819442749}, 'modelVersion': '2023-02-01-preview', 'metadata': {'width': 375, 'height': 250}, 'tagsResult': {'values': [{'name': 'grass', 'confidence': 0.9988070130348206}, {'name': 'outdoor', 'confidence': 0.9931809306144714}, {'name': 'field', 'confidence': 0.9857797622680664}, {'name': 'animal', 'confidence': 0.9708199501037598}, {'name': 'livestock', 'confidence': 0.965355634689331}, {'name': 'cow', 'confidence': 0.954204797744751}, {'name': 'herd', 'confidence': 0.9496941566467285}, {'name': 'ranch', 'confidence': 0.9301772117614746}, {'name': 'mammal', 'confidence': 0.9299676418304443}, {'name': 'dairy cow', 'confidence': 0.9291023015975952}, {'name': 'bovine', 'confidence': 0.9199285507202148}, {'name': 'herding', 'confidence': 0.8967740535736084}, {'name': 'fodder', 'confidence': 0.8817697763442993}, {'name': 'grassland', 'confidence': 0.8811800479888916}, {'name': 'standing', 'confidence': 0.8034635782241821}, {'name': 'pasture', 'confidence': 0.6391813158988953}, {'name': 'grazing', 'confidence': 0.6333702802658081}, {'name': 'farm', 'confidence': 0.6285721063613892}, {'name': 'cattle', 'confidence': 0.5256974697113037}, {'name': 'landscape', 'confidence': 0.4293440878391266}]}}\n",
"========================\n",
"The caption for the image is \"a group of cows grazing in a field\". \n",
"\n",
"The tags in the image include: grass, outdoor, field, animal, livestock, cow, herd, ranch, mammal, dairy cow, bovine, herding, fodder, grassland, standing, pasture, grazing, farm, cattle, and landscape.\n"
]
}
],
"source": [
"from llama_hub.tools.azure_cv.base import AzureCVToolSpec\n",
"\n",
"cv_tool = AzureCVToolSpec(\n",
" api_key='your-key',\n",
" resource='your-resource'\n",
")\n",
"\n",
"agent = OpenAIAgent.from_tools(\n",
" cv_tool.to_tool_list(),\n",
" verbose=True,\n",
")\n",
"\n",
"print(agent.chat('caption this image and tell me what tags are in it https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png'))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d6be81e1-41a6-48b6-920b-b225c0f16a9b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Calling Function ===\n",
"Calling function: process_image with args: {\n",
" \"url\": \"https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg\",\n",
" \"features\": [\"caption\", \"read\"]\n",
"}\n",
"Got output: {'captionResult': {'text': 'close-up of a nutrition label', 'confidence': 0.822258710861206}, 'readResult': 'Nutrition Facts Amount Per Serving\\nServing size: 1 bar (40g)\\nServing Per Package: 4\\nTotal Fat 13g\\nSaturated Fat 1.5g\\nAmount Per Serving\\nTrans Fat 0g\\ncalories 190\\nCholesterol 0mg\\nories from Fat 110\\nSodium 20mg\\nnt Daily Values are based on\\nVitamin A 50\\ncalorie diet', 'modelVersion': '2023-02-01-preview', 'metadata': {'width': 1254, 'height': 704}}\n",
"========================\n",
"The caption for the image is \"close-up of a nutrition label\".\n",
"\n",
"The text from the image is as follows:\n",
"\n",
"\"Nutrition Facts Amount Per Serving\n",
"Serving size: 1 bar (40g)\n",
"Serving Per Package: 4\n",
"Total Fat 13g\n",
"Saturated Fat 1.5g\n",
"Amount Per Serving\n",
"Trans Fat 0g\n",
"calories 190\n",
"Cholesterol 0mg\n",
"ories from Fat 110\n",
"Sodium 20mg\n",
"nt Daily Values are based on\n",
"Vitamin A 50\n",
"calorie diet\"\n"
]
}
],
"source": [
"print(agent.chat('caption this image and read any text https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "708cc1e0-199b-48a6-a88b-17af19b3f518",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit fd8b7c5

Please sign in to comment.