-
Notifications
You must be signed in to change notification settings - Fork 0
/
RAG.py
171 lines (128 loc) · 5.3 KB
/
RAG.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# -*- coding: utf-8 -*-
"""twelve lab api.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1xJzhMEDw7hK632kbE6CsDz-oWXPNwbUy
"""
# Dependencies: install these once from a shell or a notebook cell before running.
# NOTE: `!pip ...` is IPython/Colab shell syntax and a SyntaxError in a plain
# .py file, so the commands are kept here as comments.
# !pip install twelvelabs
# !pip install chromadb
# !pip install requests
from twelvelabs import TwelveLabs
import chromadb
import requests
import glob
from pprint import pprint
import os
# Commented out IPython magic to ensure Python compatibility.
# Specify the API key and the URL of the API through the environment, e.g.:
# %env API_KEY=<your Twelve Labs API key>
# %env API_URL=https://api.twelvelabs.io/p/v1.2
# NOTE(security): never commit a real API key to source control; any key that
# has been committed should be revoked and replaced.

# Retrieve the URL of the API and my API key.
# Explicit checks are used instead of `assert`, which is stripped under `python -O`.
# NOTE(review): API_URL is only validated in the code visible here; the request
# URLs further down are hard-coded.
API_URL = os.getenv("API_URL")
if not API_URL:
    raise RuntimeError("The API_URL environment variable is not set.")

API_KEY = os.getenv("API_KEY")
if not API_KEY:
    raise RuntimeError("The API_KEY environment variable is not set.")

# Twelve Labs SDK client, built from the key validated above.
client = TwelveLabs(api_key=API_KEY)
"""# Video Embedding"""
# Create a video-embedding task by calling the REST endpoint directly.
# NOTE(review): the SDK call `client.embed.task.create(...)` later in this file
# appears to submit the same video again -- confirm whether both are needed.
url = "https://api.twelvelabs.io/v1.2/embed/tasks"
payload = {
    "engine_name": "Marengo-retrieval-2.6",
    "video_embedding_scope": "clip",
}
headers = {
    "accept": "application/json",
    # Use the key loaded from the environment rather than a literal in source.
    "x-api-key": API_KEY,
}
# Open the video in a context manager so the file handle is closed as soon as
# the upload request has completed.
with open("/content/testvid.mp4", "rb") as video_file:  # the path to the video
    files = {"video_file": ("testvid.mp4", video_file, "video/mp4")}
    response = requests.post(url, data=payload, files=files, headers=headers)
print(response.text)
# Build a Twelve Labs SDK client from the API_KEY environment variable and
# submit the local video file as an embedding task.
client = TwelveLabs(api_key=os.getenv("API_KEY"))
task = client.embed.task.create(
    video_file="/content/testvid.mp4",
    engine_name="Marengo-retrieval-2.6",
)
def on_task_update(task):
    """Progress callback: print the current status of ``task``."""
    current_status = task.status
    print(f" Status={current_status}")
# Wait for the embedding task to finish (sleep_interval=2 between checks),
# reporting progress through on_task_update.
status = task.wait_for_done(callback=on_task_update, sleep_interval=2)
print(f"Embedding done: {status}")

# Fetch the task again by its ID and print every embedding attached to it.
task = client.embed.task.retrieve(task.id)
if task.video_embeddings is not None:
    for segment in task.video_embeddings:
        print(
            f"embedding_scope={segment.embedding_scope} start_offset_sec={segment.start_offset_sec} end_offset_sec={segment.end_offset_sec}"
        )
        joined_values = ", ".join(map(str, segment.embedding.float))
        print(f"embeddings: {joined_values}")
"""as you are uploading the video to their database and you will get the video id here and you will add it to chroma when passing the video embeddings"""
"""#Chroma
Saving the embeddings and url or video id in Chroma Vector database
"""
# Use a dedicated name for the Chroma client so it does not shadow `client`,
# which holds the Twelve Labs SDK client elsewhere in this script.
chroma_client = chromadb.Client()
# Collection that stores the video embeddings and their metadata.
collection = chroma_client.get_or_create_collection("test")
def split_embeddings(embeddings, chunk_size=3):
    """Split a flat sequence into consecutive chunks of ``chunk_size`` items.

    Args:
        embeddings: Sliceable sequence of values (for example a list of floats).
        chunk_size: Number of values per chunk. Defaults to 3, the size that
            was previously hard-coded.

    Returns:
        A list of slices of ``embeddings``. The last chunk is shorter when
        ``len(embeddings)`` is not a multiple of ``chunk_size``; an empty
        input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.

    Note:
        Each chunk has fewer dimensions than the input, so chunks cannot be
        compared with full-length embedding vectors.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    return [
        embeddings[start:start + chunk_size]
        for start in range(0, len(embeddings), chunk_size)
    ]
# ID of the video the stored embeddings belong to.
# TODO: replace with the actual video ID.
VIDEO_ID = "66ae91e8fe45e78ff2976a75"

if task.video_embeddings is not None:
    embeddings = []
    metadatas = []
    ids = []
    # Store ONE full-length vector per clip. Cutting a vector into 3-value
    # fragments (as was done before) discards its meaning: nearest-neighbour
    # search is only valid between complete vectors of the same dimension.
    for clip_number, v in enumerate(task.video_embeddings, start=1):
        embeddings.append(list(v.embedding.float))
        metadatas.append(
            {
                "embedding_scope": v.embedding_scope,
                "start_offset_sec": v.start_offset_sec,
                "end_offset_sec": v.end_offset_sec,
                "video_id": VIDEO_ID,
            }
        )
        ids.append(str(clip_number))  # incremental IDs: "1", "2", ...

    # Upsert embeddings and metadata into the Chroma collection.
    # Skipped when there is nothing to store, since Chroma rejects empty lists.
    if embeddings:
        collection.upsert(embeddings=embeddings, metadatas=metadatas, ids=ids)
    print(f"Upserted {len(embeddings)} embeddings into Chroma collection.")
else:
    print("No video embeddings found for this task.")

"""#Text Embedding"""
client = TwelveLabs(api_key=API_KEY)
embedding = client.embed.create(
    engine_name="Marengo-retrieval-2.6",
    text="marketing",
    text_truncate="start",
)
print("Created a text embedding")
print(f" Engine: {embedding.engine_name}")
print(f" Embedding: {embedding.text_embedding.float}")

# Query with the complete text embedding so its dimension matches the clip
# vectors stored above. Both come from the same engine; see the Twelve Labs
# Embed API docs for the guarantee that they are directly comparable.
query_result = collection.query(
    query_embeddings=[embedding.text_embedding.float],
    n_results=2,
)
# The result holds parallel lists per query ('ids', 'distances', 'metadatas', ...).
# From it we need the video ID (or URL) of the clips closest to the prompt.
print(query_result)

# Video IDs of the matches for the first (and only) query, closest first.
video_ids = [metadata["video_id"] for metadata in query_result["metadatas"][0]]
"""#Generate Endpoint"""
# Define the /generate endpoint
GENERATE_URL = "https://api.twelvelabs.io/v1.2/generate"

# Fail with a clear message instead of a bare IndexError when the vector
# search returned no matches.
if not video_ids:
    raise RuntimeError("No video IDs were retrieved from Chroma; cannot call /generate.")

# Set the header of the request
headers = {"x-api-key": API_KEY}

# Request body: the best-matching video and the question to ask about it.
# NOTE(review): the original note suggests a video URL may be passed instead
# of `video_id` -- confirm against the API documentation.
data = {
    "video_id": video_ids[0],
    # The prompt used for the text embedding could be reused here.
    "prompt": "What's this video about?",
}

# Make a generation request
response = requests.post(GENERATE_URL, headers=headers, json=data)
print(f"Status code: {response.status_code}")
try:
    pprint(response.json())
except ValueError:
    # The body was not valid JSON (e.g. a gateway error page); show it raw.
    print(response.text)