diff --git a/.github/workflows/ai-label.yml b/.github/workflows/ai-label.yml
index e8c868090..86651cea1 100644
--- a/.github/workflows/ai-label.yml
+++ b/.github/workflows/ai-label.yml
@@ -1,20 +1,21 @@
 name: AI Labeler
 
 on:
-issues:
+  issues:
     types: [opened, reopened]
-pull_request:
+  pull_request:
     types: [opened, reopened]
 
 jobs:
-ai-labeler:
+  ai-labeler:
     runs-on: ubuntu-latest
     permissions:
-    contents: read
-    issues: write
-    pull-requests: write
+      contents: read
+      issues: write
+      pull-requests: write
     steps:
-    - uses: actions/checkout@v4
-    - uses: jlowin/ai-labeler@v0.2.0
+      - uses: actions/checkout@v4
+      - uses: jlowin/ai-labeler@v0.4.0
         with:
-        openai-api-key: ${{ secrets.OPENAI_API_KEY }}
\ No newline at end of file
+          include-repo-labels: true
+          openai-api-key: ${{ secrets.OPENAI_API_KEY }}
diff --git a/instructor/client.py b/instructor/client.py
index d50659112..87cf1cb8c 100644
--- a/instructor/client.py
+++ b/instructor/client.py
@@ -13,6 +13,10 @@
     Literal,
     Any,
 )
+from tenacity import (
+    AsyncRetrying,
+    Retrying,
+)
 from collections.abc import Generator, Iterable, Awaitable, AsyncGenerator
 from typing_extensions import Self
 from pydantic import BaseModel
@@ -114,7 +118,7 @@ def create(
         self: AsyncInstructor,
         response_model: type[T],
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,
         context: dict[str, Any] | None = None,  # {{ edit_1 }}
         strict: bool = True,
@@ -126,7 +130,7 @@ def create(
         self: Self,
         response_model: type[T],
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | Retrying = 3,
         validation_context: dict[str, Any] | None = None,
         context: dict[str, Any] | None = None,  # {{ edit_1 }}
         strict: bool = True,
@@ -138,7 +142,7 @@ def create(
         self: AsyncInstructor,
         response_model: None,
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,
         context: dict[str, Any] | None = None,  # {{ edit_1 }}
         strict: bool = True,
@@ -150,7 +154,7 @@ def create(
         self: Self,
         response_model: None,
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | Retrying = 3,
         validation_context: dict[str, Any] | None = None,
         context: dict[str, Any] | None = None,  # {{ edit_1 }}
         strict: bool = True,
@@ -161,7 +165,7 @@ def create(
         self,
         response_model: type[T] | None,
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | Retrying | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -185,7 +189,7 @@ def create_partial(
         self: AsyncInstructor,
         response_model: type[T],
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,
         context: dict[str, Any] | None = None,  # {{ edit_1 }}
         strict: bool = True,
@@ -197,7 +201,7 @@ def create_partial(
         self: Self,
         response_model: type[T],
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | Retrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -208,7 +212,7 @@ def create_partial(
         self,
         response_model: type[T],
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | Retrying | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -235,7 +239,7 @@ def create_iterable(
         self: AsyncInstructor,
         messages: list[ChatCompletionMessageParam],
         response_model: type[T],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -247,7 +251,7 @@ def create_iterable(
         self: Self,
         messages: list[ChatCompletionMessageParam],
         response_model: type[T],
-        max_retries: int = 3,
+        max_retries: int | Retrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -258,7 +262,7 @@ def create_iterable(
         self,
         messages: list[ChatCompletionMessageParam],
         response_model: type[T],
-        max_retries: int = 3,
+        max_retries: int | Retrying | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -284,7 +288,7 @@ def create_with_completion(
         self: AsyncInstructor,
         messages: list[ChatCompletionMessageParam],
         response_model: type[T],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -296,7 +300,7 @@ def create_with_completion(
         self: Self,
         messages: list[ChatCompletionMessageParam],
         response_model: type[T],
-        max_retries: int = 3,
+        max_retries: int | Retrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -307,7 +311,7 @@ def create_with_completion(
         self,
         messages: list[ChatCompletionMessageParam],
         response_model: type[T],
-        max_retries: int = 3,
+        max_retries: int | Retrying | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -373,7 +377,7 @@ async def create(
         self,
         response_model: type[T] | None,
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -395,7 +399,7 @@ async def create_partial(
         self,
         response_model: type[T],
         messages: list[ChatCompletionMessageParam],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -419,7 +423,7 @@ async def create_iterable(
         self,
         messages: list[ChatCompletionMessageParam],
         response_model: type[T],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
@@ -443,7 +447,7 @@ async def create_with_completion(
         self,
         messages: list[ChatCompletionMessageParam],
         response_model: type[T],
-        max_retries: int = 3,
+        max_retries: int | AsyncRetrying = 3,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
         strict: bool = True,
diff --git a/instructor/patch.py b/instructor/patch.py
index 2d1f340e9..b64ad32d5 100644
--- a/instructor/patch.py
+++ b/instructor/patch.py
@@ -22,6 +22,11 @@
 from instructor.mode import Mode
 import logging
 
+from tenacity import (
+    AsyncRetrying,
+    Retrying,
+)
+
 logger = logging.getLogger("instructor")
 
 T_Model = TypeVar("T_Model", bound=BaseModel)
@@ -35,7 +40,7 @@ def __call__(
         response_model: type[T_Model] | None = None,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
-        max_retries: int = 1,
+        max_retries: int | Retrying = 1,
         *args: Any,
         **kwargs: Any,
     ) -> T_Model: ...
@@ -47,7 +52,7 @@ async def __call__(
         response_model: type[T_Model] | None = None,
         validation_context: dict[str, Any] | None = None,  # Deprecate in 2.0
         context: dict[str, Any] | None = None,
-        max_retries: int = 1,
+        max_retries: int | AsyncRetrying = 1,
         *args: Any,
         **kwargs: Any,
     ) -> T_Model: ...
@@ -140,7 +145,7 @@ async def new_create_async(
         response_model: type[T_Model] | None = None,
         validation_context: dict[str, Any] | None = None,
         context: dict[str, Any] | None = None,
-        max_retries: int = 1,
+        max_retries: int | AsyncRetrying = 1,
         strict: bool = True,
         hooks: Hooks | None = None,
         *args: T_ParamSpec.args,
@@ -171,7 +176,7 @@ def new_create_sync(
         response_model: type[T_Model] | None = None,
         validation_context: dict[str, Any] | None = None,
         context: dict[str, Any] | None = None,
-        max_retries: int = 1,
+        max_retries: int | Retrying = 1,
         strict: bool = True,
         hooks: Hooks | None = None,
         *args: T_ParamSpec.args,
diff --git a/instructor/retry.py b/instructor/retry.py
index ffc41194d..205c7843f 100644
--- a/instructor/retry.py
+++ b/instructor/retry.py
@@ -14,7 +14,7 @@
 from instructor.utils import update_total_usage
 from instructor.validators import AsyncValidationError
 from openai.types.chat import ChatCompletion
-from openai.types.completion_usage import CompletionUsage
+from openai.types.completion_usage import CompletionUsage, CompletionTokensDetails, PromptTokensDetails
 from pydantic import BaseModel, ValidationError
 from tenacity import (
     AsyncRetrying,
@@ -71,7 +71,10 @@ def initialize_usage(mode: Mode) -> CompletionUsage | Any:
     Returns:
         CompletionUsage | Any: Initialized usage object.
     """
-    total_usage = CompletionUsage(completion_tokens=0, prompt_tokens=0, total_tokens=0)
+    total_usage = CompletionUsage(completion_tokens=0, prompt_tokens=0, total_tokens=0,
+        completion_tokens_details=CompletionTokensDetails(audio_tokens=0, reasoning_tokens=0),
+        prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0),
+    )  # zero-seed both *_tokens_details fields; update_total_usage skips a details object that is None
     if mode in {Mode.ANTHROPIC_TOOLS, Mode.ANTHROPIC_JSON}:
         from anthropic.types import Usage as AnthropicUsage
 
diff --git a/instructor/utils.py b/instructor/utils.py
index 6bee2b1c6..90f769cbc 100644
--- a/instructor/utils.py
+++ b/instructor/utils.py
@@ -142,6 +142,12 @@ def update_total_usage(
         total_usage.completion_tokens += response_usage.completion_tokens or 0
         total_usage.prompt_tokens += response_usage.prompt_tokens or 0
         total_usage.total_tokens += response_usage.total_tokens or 0
+        if (rtd := response_usage.completion_tokens_details) and (ttd := total_usage.completion_tokens_details):
+            ttd.audio_tokens = (ttd.audio_tokens or 0) + (rtd.audio_tokens or 0)
+            ttd.reasoning_tokens = (ttd.reasoning_tokens or 0) + (rtd.reasoning_tokens or 0)
+        if (rpd := response_usage.prompt_tokens_details) and (tpd := total_usage.prompt_tokens_details):
+            tpd.audio_tokens = (tpd.audio_tokens or 0) + (rpd.audio_tokens or 0)
+            tpd.cached_tokens = (tpd.cached_tokens or 0) + (rpd.cached_tokens or 0)
         response.usage = total_usage  # Replace each response usage with the total usage
         return response