Commit
feat: enhance multimodal support for images and audio in instructor (#…
jxnl authored Nov 23, 2024
1 parent e06e315 commit 068d183
Showing 65 changed files with 609 additions and 358 deletions.
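The commit title references image and audio input support; as a point of reference, here is a minimal sketch of that kind of usage, assuming the `instructor.Image` helper described in the instructor docs (the model name, URL, and response schema are illustrative, not part of this diff):

```python
import instructor
from openai import OpenAI
from pydantic import BaseModel


class ImageDescription(BaseModel):
    caption: str
    objects: list[str]


client = instructor.from_openai(OpenAI())

# Text and image parts can be mixed in the content list; instructor
# converts the Image object into the provider's multimodal format.
resp = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=ImageDescription,
    messages=[
        {
            "role": "user",
            "content": [
                "Describe this image",
                instructor.Image.from_url("https://example.com/photo.jpg"),
            ],
        }
    ],
)
print(resp.caption)
```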
9 changes: 8 additions & 1 deletion README.md
@@ -66,27 +66,34 @@ import instructor
from openai import OpenAI
from pydantic import BaseModel


class UserInfo(BaseModel):
    name: str
    age: int


# Initialize the OpenAI client with Instructor
client = instructor.from_openai(OpenAI())


# Define hook functions
def log_kwargs(**kwargs):
    print(f"Function called with kwargs: {kwargs}")


def log_exception(exception: Exception):
    print(f"An exception occurred: {str(exception)}")


client.on("completion:kwargs", log_kwargs)
client.on("completion:error", log_exception)

user_info = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=UserInfo,
-    messages=[{"role": "user", "content": "Extract the user name: 'John is 20 years old'"}],
+    messages=[
+        {"role": "user", "content": "Extract the user name: 'John is 20 years old'"}
+    ],
)

"""
4 changes: 2 additions & 2 deletions docs/blog/posts/announcing-gemini-tool-calling-support.md
@@ -66,7 +66,7 @@ class User(BaseModel):

client = instructor.from_gemini(
    client=genai.GenerativeModel(
-        model_name="models/gemini-1.5-flash-latest", # (1)!
+        model_name="models/gemini-1.5-flash-latest",  # (1)!
    )
)

@@ -105,7 +105,7 @@ class User(BaseModel):


client = instructor.from_vertexai(
-    client=GenerativeModel("gemini-1.5-pro-preview-0409"), # (1)!
+    client=GenerativeModel("gemini-1.5-pro-preview-0409"),  # (1)!
)


12 changes: 6 additions & 6 deletions docs/blog/posts/anthropic-prompt-caching.md
@@ -187,7 +187,6 @@ Let's first initialize our Anthropic client, this will be the same as what we've
```python
from instructor import Instructor, Mode, patch
from anthropic import Anthropic
-from pydantic import BaseModel


client = Instructor(
@@ -203,9 +202,10 @@ client = Instructor(
We'll then create a new `Character` class that will be used to extract a single character from the text, and read in our source text (roughly 2856 tokens using the Anthropic tokenizer).

```python
-with open("./book.txt", "r") as f:
+with open("./book.txt") as f:
    book = f.read()


class Character(BaseModel):
    name: str
    description: str
@@ -215,7 +215,7 @@ Once we've done this, we can then make an api call to get the description of the

```python
for _ in range(2):
-    resp, completion = client.chat.completions.create_with_completion( # (1)!
+    resp, completion = client.chat.completions.create_with_completion(  # (1)!
        model="claude-3-haiku-20240307",
        messages=[
            {
@@ -224,7 +224,7 @@ for _ in range(2):
                {
                    "type": "text",
                    "text": "<book>" + book + "</book>",
-                    "cache_control": {"type": "ephemeral"}, # (2)!
+                    "cache_control": {"type": "ephemeral"},  # (2)!
                },
                {
                    "type": "text",
@@ -238,7 +238,7 @@ for _ in range(2):
    )
    assert isinstance(resp, Character)

-    print(completion.usage) # (3)!
+    print(completion.usage)  # (3)!
    print(resp)
```

@@ -307,7 +307,7 @@ class Character(BaseModel):
    description: str
-with open("./book.txt", "r") as f:
+with open("./book.txt") as f:
    book = f.read()
for _ in range(2):
57 changes: 27 additions & 30 deletions docs/blog/posts/bad-schemas-could-break-llms.md
@@ -49,6 +49,7 @@ from datasets import load_dataset, Dataset, DatasetDict

splits = ["test", "train"]


def generate_gsm8k(split):
    ds = load_dataset("gsm8k", "main", split=split, streaming=True)
    for row in ds:
@@ -60,6 +61,7 @@ def generate_gsm8k(split):
                "reasoning": reasoning,
            }


# Create the dataset for train and test splits
train_dataset = Dataset.from_generator(lambda: generate_gsm8k("train"))
test_dataset = Dataset.from_generator(lambda: generate_gsm8k("test"))
@@ -143,6 +145,7 @@ class Answer(BaseModel):
    chain_of_thought: str
    answer: int


class OnlyAnswer(BaseModel):
    answer: int
```
@@ -214,22 +217,26 @@ class Answer(BaseModel):
    chain_of_thought: str
    answer: int


class AnswerWithCalculation(BaseModel):
    chain_of_thought: str
    required_calculations: list[str]
    answer: int


class AssumptionBasedAnswer(BaseModel):
    assumptions: list[str]
    logic_flow: str
    answer: int


class ErrorAwareCalculation(BaseModel):
    key_steps: list[str]
    potential_pitfalls: list[str]
    intermediate_results: list[str]
    answer: int


class AnswerWithNecessaryCalculationAndFinalChoice(BaseModel):
    chain_of_thought: str
    necessary_calculations: list[str]
@@ -279,43 +286,33 @@ In fact, the only thing that changed was the last two parameters. Upon closer in

```python
{
-    "chain_of_thought": "In the race, there are a total of 240 Asians. Given that 80 were Japanese, we can calculate the number of Chinese participants by subtracting the number of Japanese from the total number of Asians: 240 - 80 = 160. Now, it is given that there are 60 boys on the Chinese team. Therefore, to find the number of girls on the Chinese team, we subtract the number of boys from the total number of Chinese participants: 160 - 60 = 100 girls. Thus, the number of girls on the Chinese team is 100.",
-    "necessary_calculations": [
-        "Total Asians = 240",
-        "Japanese participants = 80",
-        "Chinese participants = Total Asians - Japanese participants = 240 - 80 = 160",
-        "Boys in Chinese team = 60",
-        "Girls in Chinese team = Chinese participants - Boys in Chinese team = 160 - 60 = 100"
-    ],
-    "potential_final_choices": [
-        "60",
-        "100",
-        "80",
-        "120"
-    ],
-    "final_choice": 2
+    "chain_of_thought": "In the race, there are a total of 240 Asians. Given that 80 were Japanese, we can calculate the number of Chinese participants by subtracting the number of Japanese from the total number of Asians: 240 - 80 = 160. Now, it is given that there are 60 boys on the Chinese team. Therefore, to find the number of girls on the Chinese team, we subtract the number of boys from the total number of Chinese participants: 160 - 60 = 100 girls. Thus, the number of girls on the Chinese team is 100.",
+    "necessary_calculations": [
+        "Total Asians = 240",
+        "Japanese participants = 80",
+        "Chinese participants = Total Asians - Japanese participants = 240 - 80 = 160",
+        "Boys in Chinese team = 60",
+        "Girls in Chinese team = Chinese participants - Boys in Chinese team = 160 - 60 = 100",
+    ],
+    "potential_final_choices": ["60", "100", "80", "120"],
+    "final_choice": 2,
}
```

This meant that instead of returning the final answer of 100 directly, our model was generating the potential responses it could give and returning the final choice as the index of that answer. Simply renaming the fields of our response model to `potential_final_answers` and `final_answer` restored the original accuracy of `95%`.

```python
{
-    "chain_of_thought": "First, we need to determine how many Asians were Chinese. Since there were 240 Asians in total and 80 of them were Japanese, we can find the number of Chinese by subtracting the number of Japanese from the total: 240 - 80 = 160. Now, we know that there are 160 Chinese participants. Given that there were 60 boys on the Chinese team, we can find the number of girls by subtracting the number of boys from the total number of Chinese: 160 - 60 = 100. Therefore, there are 100 girls on the Chinese team.",
-    "necessary_calculations": [
-        "Total Asians = 240",
-        "Number of Japanese = 80",
-        "Number of Chinese = 240 - 80 = 160",
-        "Number of boys on Chinese team = 60",
-        "Number of girls on Chinese team = 160 - 60 = 100"
-    ],
-    "potential_final_answers": [
-        "100",
-        "60",
-        "80",
-        "40"
-    ],
-    "answer": 100
+    "chain_of_thought": "First, we need to determine how many Asians were Chinese. Since there were 240 Asians in total and 80 of them were Japanese, we can find the number of Chinese by subtracting the number of Japanese from the total: 240 - 80 = 160. Now, we know that there are 160 Chinese participants. Given that there were 60 boys on the Chinese team, we can find the number of girls by subtracting the number of boys from the total number of Chinese: 160 - 60 = 100. Therefore, there are 100 girls on the Chinese team.",
+    "necessary_calculations": [
+        "Total Asians = 240",
+        "Number of Japanese = 80",
+        "Number of Chinese = 240 - 80 = 160",
+        "Number of boys on Chinese team = 60",
+        "Number of girls on Chinese team = 160 - 60 = 100",
+    ],
+    "potential_final_answers": ["100", "60", "80", "40"],
+    "answer": 100,
}
```

12 changes: 7 additions & 5 deletions docs/blog/posts/best_framework.md
@@ -31,25 +31,27 @@ Here's an example of extracting structured user data from an LLM:
from pydantic import BaseModel
import instructor


class User(BaseModel):
    name: str
    age: int


client = instructor.from_openai(openai.OpenAI())

user = client.chat.completions.create(
    model="gpt-3.5-turbo",
-    response_model=User, # (1)!
+    response_model=User,  # (1)!
    messages=[
        {
            "role": "user",
-            "content": "Extract the user's name and age from this: John is 25 years old"
+            "content": "Extract the user's name and age from this: John is 25 years old",
        }
-    ]
+    ],
)

-print(user) # (2)!
-# > User(name='John', age=25)
+print(user)  # (2)!
+#> User(name='John', age=25)
```

1. Notice that we now pass a new `response_model` parameter to the `completions.create` method. This parameter lets us specify the structure we want the LLM output mapped to. In this case, we're using a Pydantic model called `User` that describes a user's name and age.
3 changes: 3 additions & 0 deletions docs/blog/posts/caching.md
@@ -107,15 +107,18 @@ print(f"Time taken: {time.perf_counter() - start}")
def decorator(func):
    def wrapper(*args, **kwargs):
        print("Do something before") # (1)
        #> Do something before
        result = func(*args, **kwargs)
        print("Do something after") # (2)
        #> Do something after
        return result

    return wrapper


@decorator
def say_hello():
    #> Hello!
    print("Hello!")


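The decorator pattern above is the same mechanism the post applies to caching: wrap the extraction function and hand back a stored result when its arguments repeat. A minimal sketch using `functools.cache` (the schema, model, and function names here are illustrative, not part of this diff):

```python
import functools

import instructor
from openai import OpenAI
from pydantic import BaseModel


class UserDetail(BaseModel):
    name: str
    age: int


client = instructor.from_openai(OpenAI())


@functools.cache
def extract(data: str) -> UserDetail:
    # Repeat calls with the same `data` return the cached result
    # instead of issuing a new API request.
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model=UserDetail,
        messages=[{"role": "user", "content": data}],
    )
```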
2 changes: 2 additions & 0 deletions docs/blog/posts/chat-with-your-pdf-with-gemini.md
@@ -55,10 +55,12 @@ client = instructor.from_gemini(
    )
)


# Define your output structure
class Summary(BaseModel):
    summary: str


# Upload the PDF
file = genai.upload_file("path/to/your.pdf")

20 changes: 10 additions & 10 deletions docs/blog/posts/distilation-part1.md
@@ -73,16 +73,16 @@ for _ in range(10):
    a = random.randint(100, 999)
    b = random.randint(100, 999)
    print(fn(a, b))
-    #> a=873 b=234 result=204282
-    #> a=902 b=203 result=183106
-    #> a=962 b=284 result=273208
-    #> a=491 b=739 result=362849
-    #> a=193 b=400 result=77200
-    #> a=300 b=448 result=134400
-    #> a=952 b=528 result=502656
-    #> a=574 b=797 result=457478
-    #> a=482 b=204 result=98328
-    #> a=781 b=278 result=217118
+    #> a=444 b=204 result=90576
+    #> a=194 b=489 result=94866
+    #> a=199 b=467 result=92933
+    #> a=967 b=452 result=437084
+    #> a=718 b=370 result=265660
+    #> a=926 b=144 result=133344
+    #> a=847 b=570 result=482790
+    #> a=649 b=227 result=147323
+    #> a=487 b=180 result=87660
+    #> a=665 b=400 result=266000
```
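The `fn` being sampled above comes from earlier in the post, where it is wrapped in instructor's distillation decorator so that every call is logged as a fine-tuning example. A rough sketch under that assumption, using the `Instructions` API from the instructor docs (the client setup and log-file name are illustrative):

```python
import logging

import instructor
from openai import OpenAI
from pydantic import BaseModel

logging.basicConfig(level=logging.INFO)

# Every call to a decorated function is logged as a fine-tuning example.
instructions = instructor.Instructions(
    name="three_digit_multiply",
    finetune_format="messages",
    log_handlers=[logging.FileHandler("math_finetunes.jsonl")],
)

client = instructor.from_openai(OpenAI())


class Multiply(BaseModel):
    a: int
    b: int
    result: int


@instructions.distil
def fn(a: int, b: int) -> Multiply:
    # Printing the returned Pydantic model yields lines like
    # "a=873 b=234 result=204282", as shown in the diff above.
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model=Multiply,
        messages=[{"role": "user", "content": f"What is {a} * {b}?"}],
    )
```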

## The Intricacies of Fine-tuning Language Models
