From f808ceb68b92d423a0aea61ff776e74f567f8cd9 Mon Sep 17 00:00:00 2001 From: Hermes Date: Thu, 26 Sep 2024 07:08:16 -0400 Subject: [PATCH] Doc Improvements (#1585) * updated explanations * use COMPANY * new intro * updated overviews * updated quickstart * added comments * updated file structure * updated file structure * feedback updates --------- Co-authored-by: saudsami --- assets/code/open-ai-integration/rtc-py.mdx | 40 ++- .../overview/core-concepts.mdx | 5 +- .../overview/product-overview.mdx | 81 +++-- shared/common/core-concepts/agora-console.mdx | 35 +- .../common/core-concepts/app-certificate.mdx | 16 +- shared/common/core-concepts/app-id.mdx | 18 +- shared/common/core-concepts/channel.mdx | 25 +- shared/common/core-concepts/open-ai-intro.mdx | 28 ++ shared/common/core-concepts/real-time-stt.mdx | 6 +- shared/common/core-concepts/sd-rtn.mdx | 10 +- shared/common/core-concepts/token.mdx | 14 +- shared/common/core-concepts/user-id.mdx | 17 +- shared/common/core-concepts/video-sdk.mdx | 2 +- shared/common/prerequisites/index.mdx | 2 - shared/open-ai-integration/quickstart.mdx | 331 +++++++++++------- .../project-implementation/python.mdx | 5 +- .../get-started-sdk/project-setup/python.mdx | 7 +- .../get-started-sdk/project-test/python.mdx | 6 +- 18 files changed, 392 insertions(+), 256 deletions(-) create mode 100644 shared/common/core-concepts/open-ai-intro.mdx diff --git a/assets/code/open-ai-integration/rtc-py.mdx b/assets/code/open-ai-integration/rtc-py.mdx index f51312f1f..f9eace442 100644 --- a/assets/code/open-ai-integration/rtc-py.mdx +++ b/assets/code/open-ai-integration/rtc-py.mdx @@ -48,8 +48,6 @@ class ChannelEventObserver(IRTCConnectionObserver, IRTCLocalUserObserver, IAudio self.emitter = event_emitter self.audio_stream = AudioStream() - - def emit_event(self, event_name: str, *args): """Helper function to emit events.""" self.emitter.emit(event_name, *args) @@ -57,7 +55,7 @@ class ChannelEventObserver(IRTCConnectionObserver, 
IRTCLocalUserObserver, IAudio def on_connected( self, agora_rtc_conn: RTCConnection, conn_info: RTCConnInfo, reason ): - logger.info(f"Connected to RTC: {agora_rtc_conn} {conn_info} {reason}") + logger.info(f"Connected to RTC: {agora_rtc_conn} {conn_info} {reason}") self.emit_event("connection_state_changed", agora_rtc_conn, conn_info, reason) def on_disconnected( @@ -132,6 +130,7 @@ class ChannelEventObserver(IRTCConnectionObserver, IRTCLocalUserObserver, IAudio def on_playback_audio_frame_before_mixing( self, agora_local_user: LocalUser, channelId, uid, frame: AudioFrame ): + # Convert the received audio frame to PcmAudioFrame audio_frame = PcmAudioFrame() audio_frame.samples_per_channel = frame.samples_per_channel audio_frame.bytes_per_sample = frame.bytes_per_sample @@ -139,6 +138,7 @@ class ChannelEventObserver(IRTCConnectionObserver, IRTCLocalUserObserver, IAudio audio_frame.sample_rate = SAMPLE_RATE audio_frame.data = frame.buffer + # Add the audio frame to the queue self.loop.call_soon_threadsafe(self.audio_stream.queue.put_nowait, audio_frame) return 0 @@ -155,16 +155,20 @@ class Channel(): self.chat = Chat(self) self.channelId = channelId self.uid = uid + + # Configure RTC connection conn_config = RTCConnConfig( client_role_type=ClientRoleType.CLIENT_ROLE_BROADCASTER, channel_profile=ChannelProfileType.CHANNEL_PROFILE_LIVE_BROADCASTING, ) self.connection = self.rtc.agora_service.create_rtc_connection(conn_config) - self.channel_event_observer = ChannelEventObserver(self.emitter) + # Set up channel event observer + self.channel_event_observer = ChannelEventObserver(self.emitter) self.connection.register_observer(self.channel_event_observer) self.connection.connect("", self.channelId, self.uid) + # Configure local user self.local_user = self.connection.get_local_user() self.local_user.set_playback_audio_frame_before_mixing_parameters( CHANNELS, SAMPLE_RATE @@ -173,6 +177,7 @@ class Channel(): 
self.local_user.register_audio_frame_observer(self.channel_event_observer) self.local_user.subscribe_all_audio() + # Set up audio track for publishing self.media_node_factory = self.rtc.agora_service.create_media_node_factory() self.audio_pcm_data_sender = ( self.media_node_factory.create_audio_pcm_data_sender() @@ -183,6 +188,7 @@ class Channel(): self.audio_track.set_enabled(1) self.local_user.publish_audio(self.audio_track) + # Create data stream for messaging self.stream_id = self.connection.create_data_stream(False, False) self.received_chunks = {} self.waiting_message = None @@ -214,10 +220,11 @@ class Channel(): async def push_audio_frame(self, frame: bytes) -> None: """ Pushes an audio frame to the channel. - + Parameters: frame: The audio frame to push. """ + # Create a PcmAudioFrame from the input bytes audio_frame = PcmAudioFrame() audio_frame.data = bytearray(frame) audio_frame.timestamp = 0 @@ -228,6 +235,7 @@ class Channel(): len(frame) / audio_frame.bytes_per_sample / audio_frame.number_of_channels ) + # Send the audio frame self.audio_pcm_data_sender.send_audio_pcm_data(audio_frame) async def subscribe_audio(self, uid: int) -> None: @@ -299,7 +307,7 @@ class Channel(): def _split_string_into_chunks(self, long_string, msg_id, chunk_size=300) -> list[dict[str: Any]]: """ Splits a long string into chunks of a given size. - + Parameters: long_string: The string to split. msg_id: The message ID. @@ -307,7 +315,7 @@ class Channel(): Returns: list[dict[str: Any]]: The list of chunks. 
- + """ total_parts = (len(long_string) + chunk_size - 1) // chunk_size json_chunks = [] @@ -321,10 +329,10 @@ class Channel(): 'content': long_string[start:end] } json_chunk = json.dumps(chunk, ensure_ascii=False) - json_chunks.append(json_chunk) + json_chunks.append(json_chunk) return json_chunks - async def send_stream_message(self, data: str, msg_id: str) -> None: + async def send_stream_message(self, data: str, msg_id: str) -> None: """ Sends a stream message to the channel. @@ -333,14 +341,14 @@ class Channel(): msg_id: The message ID. """ - chunks = self._split_string_into_chunks(data, msg_id) + chunks = self._split_string_into_chunks(data, msg_id) for chunk in chunks: self.connection.send_stream_message(self.stream_id, chunk) def on(self, event_name: str, callback): """ Allows external components to subscribe to events. - + Parameters: event_name: The name of the event to subscribe to. callback: The callback to call when the event is emitted. @@ -351,7 +359,7 @@ class Channel(): def once(self, event_name: str, callback): """ Allows external components to subscribe to events once. - + Parameters: event_name: The name of the event to subscribe to. callback: The callback to call when the event is emitted. @@ -386,6 +394,7 @@ class Chat(): "unhandled exception", exc_info=t.exception(), ) + # Start processing messages asyncio.create_task(self._process_message()).add_done_callback(log_exception) async def send_message(self, item: ChatMessage) -> None: @@ -402,7 +411,6 @@ class Chat(): """ Processes messages in the queue. """ - while True: item: ChatMessage = await self.queue.get() await self.channel.send_stream_message(item.message, item.msg_id) @@ -436,13 +444,12 @@ class RtcEngine: Returns: Channel: The channel. 
""" - loop = asyncio.get_event_loop() future = loop.create_future() def callback(agora_rtc_conn: RTCConnection, conn_info: RTCConnInfo, reason): channel.off("connection_state_changed", callback) - if conn_info.state == 3: + if conn_info.state == 3: # 3 indicates a successful connection future.set_result(channel) else: future.set_exception( @@ -460,4 +467,5 @@ class RtcEngine: Destroys the RTC engine. """ self.agora_service.release()`} - \ No newline at end of file + + diff --git a/open-ai-integration/overview/core-concepts.mdx b/open-ai-integration/overview/core-concepts.mdx index be22c1aa9..47f20a7c9 100644 --- a/open-ai-integration/overview/core-concepts.mdx +++ b/open-ai-integration/overview/core-concepts.mdx @@ -7,9 +7,8 @@ description: > Ideas that are central to developing with Agora. --- -import CoreConcepts from '@docs/shared/common/core-concepts/real-time-stt.mdx'; +import CoreConcepts from '@docs/shared/common/core-concepts/open-ai-intro.mdx'; export const toc = [{}]; - - + diff --git a/open-ai-integration/overview/product-overview.mdx b/open-ai-integration/overview/product-overview.mdx index 8fcd6b56c..6115ab28c 100644 --- a/open-ai-integration/overview/product-overview.mdx +++ b/open-ai-integration/overview/product-overview.mdx @@ -7,44 +7,49 @@ description: > --- -Integrating Agora's real-time audio communication with OpenAI's Large Language Models (LLM) opens the door to powerful, interactive voice-based applications. Create seamless voice-enabled experiences, such as voice-controlled AI assistants, or interactive dialogue systems by combining Agora's robust real-time audio streaming capabilities with the conversational intelligence of OpenAI's LLMs. This integration allows for dynamic, responsive audio interactions, enhancing user engagement across a wide range of use cases—from customer support bots to collaborative voice-driven applications. 
+Integrating Agora’s real-time audio communication with OpenAI’s Large Language Models (LLMs) unlocks the potential for powerful, interactive voice-based applications. By combining Agora’s robust real-time audio streaming capabilities with the conversational intelligence of OpenAI’s LLMs, you can create seamless voice-enabled experiences, such as voice-powered AI assistants or interactive dialogue systems. This integration enables dynamic, responsive audio interactions, enhancing user engagement across a broad range of use cases—from customer support bots to collaborative voice-driven applications. - \ No newline at end of file +Most importantly, by combining the strengths of Agora and OpenAI, this integration enables the most natural form of language interaction, lowering the barrier for users to harness the power of AI and making advanced technologies more accessible than ever before. + diff --git a/shared/common/core-concepts/agora-console.mdx b/shared/common/core-concepts/agora-console.mdx index c5829e05e..0ef193e2c 100644 --- a/shared/common/core-concepts/agora-console.mdx +++ b/shared/common/core-concepts/agora-console.mdx @@ -1,25 +1,28 @@ - -To use SDKs, create an audio and video project in the first. See [Agora account management](../get-started/manage-agora-account) for details. + + is the main dashboard where you manage your projects and services. Before you can use 's SDKs, you must first create a project in the . See [Agora account management](../get-started/manage-agora-account) for + details. - -To use , create a project in the first. See [Agora account management](../reference/manage-agora-account) for details. + +To use , create a project in the first. - -![Create project in Agora Console](/images/common/create-project.svg) - +![Create project in Agora Console](/images/common/create-project.svg) #### - is the main dashboard where you manage your projects and services. provides an intuitive interface for developers to query and manage their account. 
After registering an Agora Account, you use the to perform the following tasks: + provides an intuitive interface for developers to query and manage their account. After registering an Agora Account, you use the to perform the following tasks: + +- Manage the account +- Create and configure projects and services +- Get an App ID +- Manage members and roles +- Check call quality and usage +- Check bills and make payments +- Access product resources + + also provides RESTful APIs that you use to implement features such as creating a project and fetching usage numbers programmatically. -- Manage the account -- Create and configure projects and services -- Get an App ID -- Manage members and roles -- Check call quality and usage -- Check bills and make payments -- Access product resources +#### Account Management - also provides RESTful APIs that you use to implement features such as creating a project and fetching usage numbers programmatically. \ No newline at end of file +See [Agora account management](../reference/manage-agora-account) for details on how to manage all aspects of your account. diff --git a/shared/common/core-concepts/app-certificate.mdx b/shared/common/core-concepts/app-certificate.mdx index b200b7a4c..3be9d5353 100644 --- a/shared/common/core-concepts/app-certificate.mdx +++ b/shared/common/core-concepts/app-certificate.mdx @@ -1,15 +1,5 @@ -#### App certificate +#### App Certificate -An App certificate is a string generated by to enable token authentication. It is required for generating a or authentication token. +An App Certificate is a unique key generated by the to secure projects through token authentication. It is required, along with the App ID, to generate a token that proves authorization between your systems and 's network. App Certificates are used to generate or authentication tokens. 
- - -To use your App certificate for setting up a token server, see [Create and run a token server.](../get-started/authentication-workflow) - - - - -To use your App certificate for setting up a token server, see [Create and run a token server.](../get-started/authentication-workflow) - - \ No newline at end of file +App Certificates should be stored securely in your backend systems. If your App Certificate is compromised, or if you need to meet security compliance requirements, you can invalidate certificates and create new ones through the . diff --git a/shared/common/core-concepts/app-id.mdx b/shared/common/core-concepts/app-id.mdx index eeef716ef..af9b69fa4 100644 --- a/shared/common/core-concepts/app-id.mdx +++ b/shared/common/core-concepts/app-id.mdx @@ -1,24 +1,26 @@ #### App ID -The App ID is a random string generated within when you create a new project. You can create multiple projects in your account; each project has a different App ID. This App ID enables your app users to communicate securely with each other. When you initialize in your app, you pass the App ID as an argument. The App ID is also used to create the authentication tokens that ensure secure communication in a channel. You retrieve your App ID using . +The App ID is a unique key generated by 's platform to identify each project. Each project in your account is assigned its own unique App ID. The App ID is critical for connecting users within your app. It's used to initialize the in your app, and as one of the required keys to create authentication tokens for secure communication. Retrieve your App ID using the . - uses this App ID to identify each app, provide billing and other statistical data services. + uses the App ID to identify each app and provide billing and other statistical data services. + +App IDs are stored on the front-end client and do not provide access control. Projects using only an App ID allow any user with the App ID to join voice and video streams. 
+ "interactive-live-streaming", "broadcast-streaming","signaling","open-ai-integration"]}> -For applications requiring high security in a production environment, you must choose an **App ID + Token** mechanism for [user authentication](../get-started/authentication-workflow) when creating a new project. Without an authentication token, your environment is open to anyone with access to your App ID. +For applications requiring access controls, such as those in production environments, choose an **App ID + Token** mechanism for [user authentication](../get-started/authentication-workflow) when creating a new project. Without an authentication token, your environment is open to anyone with access to your App ID. - + -For applications requiring high security in a production environment, you must choose a **App ID + Token** mechanism for user authentication when creating a new project. Without an authentication token, your environment is open to anyone with access to your App ID. +For applications requiring access controls, such as those in production environments, choose an **App ID + Token** mechanism for user authentication when creating a new project. Without an authentication token, your environment is open to anyone with your App ID. -For applications requiring high security in a production environment, you must choose a **App ID + Token** mechanism for [user authentication](../get-started/authentication-workflow) when creating a new project. Without an authentication token, your environment is open to anyone with access to your App ID. +For applications requiring access controls, such as those in production environments, choose an **App ID + Token** mechanism for [user authentication](../get-started/authentication-workflow) when creating a new project. Without an authentication token, your environment is open to anyone with your App ID. 
- \ No newline at end of file + diff --git a/shared/common/core-concepts/channel.mdx b/shared/common/core-concepts/channel.mdx index dd82b8bdc..585721d44 100644 --- a/shared/common/core-concepts/channel.mdx +++ b/shared/common/core-concepts/channel.mdx @@ -1,24 +1,29 @@ #### Channel +In 's platform, a channel is a way of grouping users together and is identified by a unique _channel name_. Users who connect to the same channel can communicate with each other. A channel is created when the first user joins and ceases to exist when the last user leaves. + -A data transfer management mechanism for passing data from one device to another. Any user who subscribes to or joins a channel can receive messages or events transmitted in that channel. Clients can subscribe to or join multiple channels at the same time. +In , channels serve as a data transfer management mechanism for passing data between devices. Clients can subscribe to or join multiple channels simultaneously. - includes the following types of channels: + supports the following channel types: -| Channel type | Main features | Applicable scenario| -|---------------|---------------|--------------------| -| Message | Follows the industry-standard pub/sub model. Channels do not need to be created in advance, and there is no upper limit on the number of publishers and subscribers in a channel.|Multi-device management and command exchange in the IoT industry, location tracking in smart devices, etc.| -| Stream |Follows the chat room model. Users need to join the channel to send and receive event notifications. Messages are managed and delivered through topics, and a single channel allows up to 1000 users to join at the same time. 
Supports channel sharing and synchronous transmission of audio and video data.|High-frequency and large concurrent data transmission or co-channel and synchronous transmission with audio and video data, such as metaverse, cloud games, etc.| +| Channel Type | Main Features | Applicable Scenarios | +| ------------ | ------------------- | --------------------------------- | +| Message | Follows the industry-standard pub/sub model. Channels do not need to be created in advance, and there is no upper limit on the number of publishers and subscribers in a channel. | Multi-device management and command exchange in the IoT industry, location tracking in smart devices, etc. | +| Stream | Follows the chat room model. Users need to join the channel to send and receive event notifications. Messages are managed and delivered through topics, and a single channel allows up to 1,000 users to join simultaneously. Supports channel sharing and synchronous transmission of audio and video data. | High-frequency and large concurrent data transmission or co-channel and synchronous transmission with audio and video data, such as in metaverse and cloud gaming applications. | - uses the *channel name* to identify a channel. Users who specify the same *channel name* join a common channel and interact with each other. A channel is created when the first user joins. It ceases to exist when the last user leaves. +Channels are created by calling the methods for transmitting real-time data. uses different channels to transmit different types of data: + +- The channel is used for transmitting audio or video data. +- The channel is used for transmitting messaging or signaling data. -You create a channel by calling the methods for transmitting real-time data. uses different channels to transmit different types of data. The channel transmits audio or video data, while the channel transmits messaging or signaling data. The and channels are independent of each other. 
+These channels are independent of each other. -Additional components provided by , such as On-Premise Recording and Cloud Recording, join the channel and provide real-time recording, transmission acceleration, media playback, and content moderation. +Additional services provided by , such as Cloud Recording and Real-Time Speech-To-Text, join the channel to provide real-time recording, transmission acceleration, media playback, and content moderation. - \ No newline at end of file + diff --git a/shared/common/core-concepts/open-ai-intro.mdx b/shared/common/core-concepts/open-ai-intro.mdx new file mode 100644 index 000000000..8f8af22bf --- /dev/null +++ b/shared/common/core-concepts/open-ai-intro.mdx @@ -0,0 +1,28 @@ +import Console from './agora-console.mdx'; +import AppId from './app-id.mdx'; +import AppCertificate from './app-certificate.mdx'; +import Token from './token.mdx'; +import Channel from './channel.mdx'; +import UserId from './user-id.mdx'; +import SD_RTN from './sd-rtn.mdx'; + +Combining Agora’s real-time audio communication with OpenAI’s Large Language Models (LLMs) opens up new possibilities for creating powerful, interactive voice-driven applications. + +This guide introduces the key processes and concepts you need to know to use 's platform effectively. + +## Using the + + + +## General Concepts + + + + + + + + +## RESTful APIs + + offers RESTful APIs across many of its products. For details, see [RESTful API](/video-calling/channel-management-api/overview). 
diff --git a/shared/common/core-concepts/real-time-stt.mdx b/shared/common/core-concepts/real-time-stt.mdx index f385e737e..92eb320a0 100644 --- a/shared/common/core-concepts/real-time-stt.mdx +++ b/shared/common/core-concepts/real-time-stt.mdx @@ -6,14 +6,16 @@ import Channel from './channel.mdx'; import UserId from './user-id.mdx'; import SD_RTN from './sd-rtn.mdx'; -Agora's enables you to transcribe audio of each host to provide live closed captions (CC) and transcription for improved accessibility. +'s enables you to transcribe audio of each host to provide live closed captions (CC) and transcription for improved accessibility. -This article introduces the key processes and concepts you need to know to use . +This guide introduces the key processes and concepts you need to know to use . ## Using the + ## General concepts + uses the following basic concepts: diff --git a/shared/common/core-concepts/sd-rtn.mdx b/shared/common/core-concepts/sd-rtn.mdx index 023aafaa5..c353d810d 100644 --- a/shared/common/core-concepts/sd-rtn.mdx +++ b/shared/common/core-concepts/sd-rtn.mdx @@ -1,9 +1,9 @@ #### -Agora's core engagement services are powered by its Software-Defined Real-time Network (SD-RTN™) that is accessible and available anytime, anywhere around the world. The software-defined network isn’t confined by device, phone numbers, or a telecommunication provider’s coverage area like traditional networks. has data centers globally that cover over 200+ countries and regions. The network delivers sub-second latency and high availability of real-time video and audio anywhere on the globe. With , Agora can deliver live user engagement experiences in the form of real-time communication (RTC) with the following advantages: +'s core engagement services are powered by its Software-Defined Real-time Network (SD-RTN™), which is accessible and available anytime, anywhere around the world. 
Unlike traditional networks, the software-defined network is not confined by device, phone numbers, or a telecommunication provider's coverage area. has data centers globally, covering over 200 countries and regions. The network delivers sub-second latency and high availability of real-time video and audio anywhere on the globe. With , can deliver live user engagement experiences in the form of real-time communication (RTC) with the following advantages: -* Unmatched quality of service -* High availability and accessibility -* True scalability -* Low Cost +- Unmatched quality of service +- High availability and accessibility +- True scalability +- Low cost diff --git a/shared/common/core-concepts/token.mdx b/shared/common/core-concepts/token.mdx index c9951abb6..b60bcc13d 100644 --- a/shared/common/core-concepts/token.mdx +++ b/shared/common/core-concepts/token.mdx @@ -1,18 +1,20 @@ -#### Token +#### Tokens -A token is a dynamic key that is used by the authentication server to check user permissions. You use to generate a temporary token for testing purposes during the development process. In a production environment, you implement a token server in your security infrastructure to control access to your channels. +A token is a dynamic key generated using the App ID, App Certificate, user ID, and expiration timestamp. Tokens authenticate and secure access to 's services, ensuring only authorized users can join a channel and participate in real-time communication. + +Tokens are generated on your server and passed to the client for use in the or . The token generation process involves digitally signing the App ID, App Certificate, user ID, and expiration timestamp using a specific algorithm, preventing tampering or forgery. + +For testing and during development, use the to generate temporary tokens. For production environments, implement a token server as part of your security infrastructure to control access to your channels. 
-For more information, see [Secure authentication with tokens](../get-started/authentication-workflow). +For information on setting up a token server for generating and managing tokens, refer to the guide on [Secure authentication with tokens](/video-calling/get-started/authentication-workflow). -For more information, see [Secure authentication with tokens](../get-started/authentication-workflow). +For information on setting up a token server for generating and managing tokens, refer to the guide on [Secure authentication with tokens](../get-started/authentication-workflow). - -After obtaining the App ID, App Certificate, and Token in the , you can start implementing basic audio and video communication in your app. \ No newline at end of file diff --git a/shared/common/core-concepts/user-id.mdx b/shared/common/core-concepts/user-id.mdx index ded23f471..1eb667457 100644 --- a/shared/common/core-concepts/user-id.mdx +++ b/shared/common/core-concepts/user-id.mdx @@ -1,16 +1,19 @@ #### User ID -A User ID (UID) identifies a user in a channel. Each user in a channel should have a unique user ID. If you do not specify a user ID when the user joins a channel, a UID is automatically generated and assigned to the user. + +In 's platform, the UID is an integer value that is a unique identifier assigned to each user within the context of a specific channel. When joining a channel, you have the choice to either assign a specific UID to the user or pass `0` or `null` and allow 's platform to automatically generate and assign a UID for the user. If two users attempt to join the same channel with the same UID, it can lead to unexpected behavior. + +The UID is used by 's services and components to identify and manage users within a channel. Developers should ensure that UIDs are properly assigned to prevent conflicts. + -A user ID (UID) identifies a user in a channel. A user is a person or entity that logs into your . 
-Each user in a [project](#app-id) must have a globally unique UID. -The same UID cannot log in to from multiple devices at the same time. If s with the same UID logs in to , the that logged in first -is disconnected and sent an event notification. +In , the UID is a string that is a unique identifier and required along with an App ID to initialize the SDK. It is used to identify the user when logging in to and throughout their session. Users can join channels by providing just the channel name, as the UID is already associated with the user during initialization. + +The same UID cannot log in to from multiple devices at the same time. If s with the same UID logs in to , the previously logged in client is disconnected and sent an event notification. The UID is used for billing and online status notifications. - \ No newline at end of file + + diff --git a/shared/common/core-concepts/video-sdk.mdx b/shared/common/core-concepts/video-sdk.mdx index 8db497ab3..b11931002 100644 --- a/shared/common/core-concepts/video-sdk.mdx +++ b/shared/common/core-concepts/video-sdk.mdx @@ -17,7 +17,7 @@ RTC (Real-Time Communication) refers to real-time communication technology, whic SDKs provide real-time audio and video interaction services, with multi-platform and multi-device support. This includes high-definition video calls, voice-only calls, interactive live streaming, as well as one-on-one and multi-group chats. -This article introduces the key processes and concepts you need to know to use SDKs. +This guide introduces the key processes and concepts you need to know to use SDKs. 
## Using the diff --git a/shared/common/prerequisites/index.mdx b/shared/common/prerequisites/index.mdx index 4c4c41e8a..b151313d6 100644 --- a/shared/common/prerequisites/index.mdx +++ b/shared/common/prerequisites/index.mdx @@ -1,7 +1,6 @@ import Android from './android.mdx'; import Ios from './ios.mdx'; import MacOS from './macos.mdx'; -import Python from './python.mdx'; import Web from './web.mdx'; import ReactNative from './react-native.mdx'; import ReactJS from './react-js.mdx'; @@ -15,7 +14,6 @@ import Unreal from './unreal.mdx'; - diff --git a/shared/open-ai-integration/quickstart.mdx b/shared/open-ai-integration/quickstart.mdx index ada8d65bb..ea4a0e671 100644 --- a/shared/open-ai-integration/quickstart.mdx +++ b/shared/open-ai-integration/quickstart.mdx @@ -1,12 +1,12 @@ import CodeBlock from '@theme/CodeBlock'; -import CodeRtcPy from '@docs/assets/code/open-ai-integration/rtc-py.mdx' -import Prerequisites from '@docs/shared/common/prerequisites/index.mdx'; +import CodeRtcPy from '@docs/assets/code/open-ai-integration/rtc-py.mdx'; +import Prerequisites from '@docs/shared/common/prerequisites/python.mdx'; -Integrating Agora's real-time audio communication capabilities with OpenAI's language models enables dynamic, conversational AI experiences. This guide shows you how to set up a Python project that combines Agora's voice SDK with OpenAI's API, creating an interactive, voice-driven assistant. +Integrating Agora’s real-time audio communication capabilities with OpenAI’s language models enables dynamic, conversational AI experiences. This guide shows you how to set up a Python project that combines Agora’s server-side Voice SDK with OpenAI’s API to create an interactive, voice-driven assistant. ## Understand the tech -The `RealtimeKitAgent` class manages the integration by connecting to an Agora channel for real-time audio streaming and to OpenAI's API for processing audio input and generating AI-driven responses. 
Audio frames captured from an Agora channel are streamed to OpenAI's API where the AI processes the input. The API responses, which include transcribed text and synthesized voice output, are then delivered back to the Agora channel. +The `RealtimeKitAgent` class manages the integration by connecting to an Agora channel for real-time audio streaming and to OpenAI's API for processing audio input and generating AI-driven responses. Audio frames captured from the Agora channel are streamed to OpenAI's API, where the AI processes the input. The API responses, which include transcribed text and synthesized voice output, are then delivered back to the Agora channel. The code sets up tools that can be executed locally or passed through the API. This allows the AI to perform specific tasks, such as retrieving data from external sources. The agent processes various message types from OpenAI, such as audio responses, transcription updates, and error messages, and sends them to users through the Agora audio channel, facilitating continuous interaction. @@ -14,70 +14,80 @@ The code sets up tools that can be executed locally or passed through the API. T -## Set up your project +## Set up the project Follow these steps to set up your Python integration project: -1. Download the OpenAI [`realtimeapi-examples`](https://openai.com/api/) package and unzip it. +1. Create a new folder for the project. -1. Create the following folder structure for your project: + ```bash + mkdir realtime-agent + cd realtime-agent/ + + ``` + +1. 
Create the following structure for your project: ``` /realtime-agent - │ - ├── agent.py - ├── .env - ├── requirements.txt - │ - ├── agora/ - │ ├── __init__.py - │ ├── rtc.py - │ - ├── realtimekit/ - │ ├── __init__.py - │ ├── realtimeapi/ - │ ├── __init__.py - │ ├── client.py - │ ├── messages.py - │ └── util.py + ├── __init__.py + ├── .env + ├── agent.py + ├── agora + │   ├── __init__.py + │   ├── requirements.txt + │   └── rtc.py + └── realtimeapi + ├── __init__.py + ├── client.py + ├── messages.py + └── util.py ``` - - `agent.py`: This is he main script that runs the `RealtimeKitAgent`. - - It imports Agora functionality from the `agora/rtc.py` module and the OpenAI capabilities from the `realtimekit/realtimeapi` package. - - `agora/rtc.py`: Contains the wrapper around the Agora Python SDK. - - `realtimekit/realtimeapi/`: Contains the classes and methods that interact with OpenAI’s Realtime API. + + This project uses the OpenAI [`realtimeapi-examples`](https://openai.com/api/) package. Download the project and unzip it into your `realtime-agent` folder. + - The [Complete code](#complete-integration-code) code for `agent.py` and `rtc.py` is provided on this page. The files in the `realtimekit/realtimeapi` folder are copied from the downloaded OpenAI package. + The following descriptions provide an overview of the key files in the project: -1. Add the following keys to your `.env` file: + - `agent.py`: The primary script responsible for executing the `RealtimeKitAgent`. It integrates Agora's functionality from the `agora/rtc.py` module and OpenAI's capabilities from the `realtimeapi` package. + - `agora/rtc.py`: Contains an implementation of the server-side Agora Python Voice SDK. + - `realtimeapi/`: Contains the classes and methods that interact with OpenAI’s Realtime API. + The [Complete code](#complete-integration-code) for `agent.py` and `rtc.py` is provided at the bottom of this page.
- # OpenAI API key for authentication - OPENAI_API_KEY=your_openai_api_key_here +1. Open your `.env` file and add the following keys: - # API base URI for the Realtime API - REALTIME_API_BASE_URI=wss://api.openai.com - ``` + ```python + # Agora RTC app ID + AGORA_APP_ID=your_agora_app_id + + # OpenAI API key for authentication + OPENAI_API_KEY=your_openai_api_key_here + + # API base URI for the Realtime API + REALTIME_API_BASE_URI=wss://api.openai.com + ``` 1. Install the dependencies: - ```bash - pip install -r requirements.txt - ``` + ```bash + pip install -r requirements.txt + ``` ## Implementation -The `RealtimeKitAgent` class integrates Agora's audio communication capabilities with OpenAI's AI services. This class manages audio streams, handles communication with the OpenAI API, and processes AI-generated responses, providing a seamless conversational AI experience. +The `RealtimeKitAgent` class integrates Agora's audio communication capabilities with OpenAI's AI services. This class manages audio streams, handles communication with the OpenAI API, and processes AI-generated responses, providing a seamless conversational AI experience. ### Connect to Agora and OpenAI -The `setup_and_run_agent` method sets up the `RealtimeKitAgent` by connecting to an Agora channel and initializing a session with the OpenAI Realtime API client. It sends configuration messages to set up the session and conversation parameters before starting the agent's operations. The method ensures the connection is properly handled and cleaned up after use. +The `setup_and_run_agent` method sets up the `RealtimeKitAgent` by connecting to an Agora channel using the provided `RtcEngine` and initializing a session with the OpenAI Realtime API client. It sends configuration messages to set up the session and define conversation parameters, such as the system message and output audio format, before starting the agent's operations. 
The method uses asynchronous execution to handle both listening for the session start and sending conversation configuration updates concurrently. It ensures that the connection is properly managed and cleaned up after use, even in cases of exceptions, early exits, or shutdowns. + + +UIDs in the Python SDK are set using a string value. Agora recommends using only numerical values for UID strings to ensure compatibility with all Agora products and extensions. + -``` python +```python @classmethod async def setup_and_run_agent( cls, @@ -86,24 +96,24 @@ async def setup_and_run_agent( inference_config: InferenceConfig, tools: ToolContext | None, ) -> None: + # Connect to a channel using the provided RtcEngine channel = await engine.connect(channelId="realtimekit_agora", uid="123") try: + # Create and enter a context manager for the RealtimeApiClient async with RealtimeApiClient( base_uri=os.getenv("REALTIME_API_BASE_URI", "wss://api.openai.com"), api_key=os.getenv("OPENAI_API_KEY"), verbose=False, ) as client: + # Send a message to update the session configuration await client.send_message( messages.UpdateSessionConfig( session=messages.SessionResource(), - # turn_detection=inference_config.turn_detection, - # transcribe_input=False, - # input_audio_format=messages.AudioFormats.PCM16, - # vads=messages.VADConfig(), ) ) + # Concurrently wait for the start session message and send the conversation config [start_session_message, _] = await asyncio.gather( *[ anext(client.listen()), @@ -118,26 +128,34 @@ async def setup_and_run_agent( ), ] ) + + # Ensure the received message is of the correct type assert isinstance(start_session_message, messages.StartSession) + + # Print session information print( f"Session started: {start_session_message.session.id} model: {start_session_message.session.model}" ) + # Create an instance of the agent agent = cls( client=client, tools=tools, channel=channel, ) + + # Run the agent await agent.run() finally: + # Ensure disconnection and 
shutdown occur, even if an exception is raised await engine.disconnect() await shutdown(asyncio.get_event_loop()) ``` ### Initialize the RealtimeKitAgent -The `RealtimeKitAgent` class constructor accepts an OpenAI `RealtimeApiClient`, an optional `ToolContext` for function registration, and an Agora Channel for audio communication. This setup prepares the agent for processing audio streams and interacting with the AI model. +The `RealtimeKitAgent` class constructor accepts an OpenAI `RealtimeApiClient`, an optional `ToolContext` for function registration, and an Agora channel for managing audio communication. This setup initializes the agent to process audio streams, register tools (if provided), and interact with the AI model. ```python def __init__( @@ -153,132 +171,161 @@ def __init__( self.channel = channel ``` -### Launch the agent +### Launch the Agent -The `entry_point` method is the primary entry point for launching the agent. It invokes `setup_and_run_agent` with the relevant parameters, initializing the agent and triggering its functionalities. +The `entry_point` method serves as the primary entry point for launching the agent. It calls `setup_and_run_agent` with the necessary parameters, initializing the agent and activating its core functionalities. ```python @classmethod async def entry_point( cls, *, - engine: RtcEngine, - inference_config: InferenceConfig, - tools: ToolContext | None = None, + engine: RtcEngine, # The Agora RTC engine instance for audio streaming + inference_config: InferenceConfig, # Configuration for the AI inference (e.g., system message, voice) + tools: ToolContext | None = None, # Optional tool context for registering functions ) -> None: + # Call the method to set up and run the agent, passing in the necessary parameters await cls.setup_and_run_agent( engine=engine, inference_config=inference_config, tools=tools ) ``` -The asynchronous `run` method orchestrates the main operations of the `RealtimeKitAgent`.
It handles audio streaming, manages tasks for processing audio input, output, and model messages, and sets up exception handling. +The asynchronous `run` method orchestrates the main operations of the `RealtimeKitAgent`. It manages audio streaming, processes tasks related to audio input, output, and model messages, and ensures exception handling is in place. ```python async def run(self) -> None: + # Log unhandled exceptions that occur in tasks def log_exception(t: asyncio.Task[Any]) -> None: if not t.cancelled() and t.exception(): logger.error( - "unhandled exception", + "Unhandled exception", exc_info=t.exception(), ) + # Future used to detect when the agent is disconnected disconnected_future = asyncio.Future[None]() + # Set the result for the disconnected future when the agent is disconnected def _on_disconnected() -> None: if not disconnected_future.done(): disconnected_future.set_result(None) + # Event listener for disconnection (commented out for now) # self.room.on("disconnected", _on_disconnected) + # Start streaming audio input to the AI model, with exception logging asyncio.create_task(self._stream_input_audio_to_model()).add_done_callback( log_exception ) + + # Start streaming audio output (synthesized responses) back to the users, with exception logging asyncio.create_task( self._stream_audio_queue_to_audio_output() ).add_done_callback(log_exception) + # Start processing model messages (e.g., transcriptions, updates), with exception logging asyncio.create_task(self._process_model_messages()).add_done_callback( log_exception ) + # Wait until the disconnection future is resolved, meaning the agent has disconnected await disconnected_future - logger.info("Agent finished running") + logger.info("Agent finished running") # Log that the agent has completed its operation ``` ### Stream input audio to the AI model -The asynchronous method `_stream_input_audio_to_model` captures audio frames from the Agora channel and sends them to the OpenAI API client for 
processing. It listens for incoming audio frames and forwards them for real-time audio analysis by the AI model. +The asynchronous method `_stream_input_audio_to_model` captures audio frames from the Agora channel and sends them to the OpenAI API client for real-time processing by the AI model. ```python async def _stream_input_audio_to_model(self) -> None: + # Retrieve audio frames from the Agora channel audio_frames = self.channel.get_audio_frames() + + # Loop through each audio frame received from the channel async for audio_frame in audio_frames: - # send the frame to the model via the API client + # Send the audio frame's data to the AI model via the OpenAI API client await self.client.send_audio_data(audio_frame.data) ``` ### Stream audio from the AI model to the user -The asynchronous method `_stream_audio_queue_to_audio_output` manages the transmission of processed audio data from the AI model back to the end-user. It retrieves audio frames from a queue and sends them to the Agora channel, allowing users to hear the AI-generated responses in real-time. +The asynchronous method `_stream_audio_queue_to_audio_output` handles the playback of processed audio data from the AI model. It retrieves audio frames from a queue and sends them to the Agora channel, allowing users to hear AI-generated responses in real-time. 
```python async def _stream_audio_queue_to_audio_output(self) -> None: while True: - # audio queue contains audio data from the model, send it the end-user via our local audio source + # Retrieve the next processed audio frame from the queue (AI model's response) frame = await self.audio_queue.get() + + # Send the audio frame to the Agora channel for playback to the user await self.channel.push_audio_frame(frame) - await asyncio.sleep(0) # allow other tasks to run + + # Yield control to allow other tasks to run, improving responsiveness + await asyncio.sleep(0) ``` -The `_process_model_messages` asynchronous method listens for incoming messages from the OpenAI API client and processes them based on their type. It handles various message types, such as audio deltas, transcriptions, and errors, ensuring appropriate actions for each. This includes updating the user chat with transcribed text and managing audio playback. +The asynchronous method `_process_model_messages` listens for messages from the OpenAI API client and processes them based on their type. It handles a variety of message types, including audio deltas, transcriptions, and errors. The method updates the user chat with transcribed text, queues audio for playback, and manages other session-related events, such as tool calls and generation states. 
```python async def _process_model_messages(self) -> None: + # Listen for incoming messages from the OpenAI API client async for message in self.client.listen(): + # Process each type of message received from the client match message: - case messages.ResonseAudioDelta(): - # logger.info("Received audio message") + case messages.ResponseAudioDelta(): + # Handle audio response deltas by decoding and adding them to the audio queue await self.audio_queue.put(base64.b64decode(message.delta)) - case messages.ResonseAudioTranscriptionDelta(): + case messages.ResponseAudioTranscriptionDelta(): + # Log and send transcribed text updates to the Agora chat channel logger.info(f"Received text message {message=}") await self.channel.chat.send_message(ChatMessage(message=message.delta, msg_id=message.output_item_id)) - case messages.ResonseAudioTranscriptionDone(): + case messages.ResponseAudioTranscriptionDone(): + # Handle completion of transcription and send the final text message logger.info(f"Text message done: {message=}") await self.channel.chat.send_message(ChatMessage(message=message.value, msg_id=message.output_item_id, done=True)) case messages.MessageAdded(): + # Placeholder for handling other message types (currently not used) pass + case messages.ServerAddMessage(): + # Placeholder for handling server-side messages (currently not used) pass case messages.VADSpeechStarted(): + # Placeholder for handling voice activity detection start pass + case messages.VADSpeechStopped(): + # Placeholder for handling voice activity detection stop pass case messages.GenerationCanceled(): + # Log when a generation process is canceled logger.info(f"Server turn canceled: {message=}") case messages.GenerationFinished(): - # TODO this is where we mark no longer appending text + # Log when the generation process is finished (e.g., no more text appending) logger.info(f"Server turn finished: {message=}") + # TODO: Implement behavior to mark generation completion # await 
self.channel.generation_finished() case messages.AddContent(type=messages.AddContentType.TOOL_CALL): - # TODO implement streaming tool calls + # TODO: Implement streaming tool calls when a tool call is added to the content logger.info(f"Received tool call buffer add {message=}") case messages.RealtimeError(error=error): - # TODO do we have to stop the session here? + # Log any errors received from the OpenAI client logger.error(f"Received error message {error=}") case _: + # Log any unhandled or unknown message types logger.warning(f"Unhandled message {message=}") ``` - ### Complete integration code The `agent.py` script integrates the code components presented in this section into reusable Python classes that you can extend for your own applications. @@ -304,16 +351,19 @@ from realtimekit.realtimeapi.client import RealtimeApiClient from .agora.rtc import Channel, Chat, ChatMessage, RtcEngine -logger = logging.getLogger(__name__) +# Logger configuration + +logger = logging.getLogger(__name__) +# Data classes for configuration and tool declarations @dataclass(frozen=True, kw_only=True) class InferenceConfig: + """Configuration for the inference process.""" system_message: str | None = None turn_detection: messages.TurnDetectionTypes | None = None voice: messages.Voices | None = None - @dataclass(frozen=True, kw_only=True) class LocalFunctionToolDeclaration: """Declaration of a tool that can be called by the model, and runs a function locally on the tool context.""" @@ -333,7 +383,6 @@ class LocalFunctionToolDeclaration: }, } - @dataclass(frozen=True, kw_only=True) class PassThroughFunctionToolDeclaration: """Declaration of a tool that can be called by the model, and is passed through the LiveKit client.""" @@ -352,28 +401,28 @@ class PassThroughFunctionToolDeclaration: }, } +# Type alias for tool declarations ToolDeclaration = LocalFunctionToolDeclaration | PassThroughFunctionToolDeclaration - @dataclass(frozen=True, kw_only=True) class LocalToolCallExecuted:
json_encoded_output: str - @dataclass(frozen=True, kw_only=True) class ShouldPassThroughToolCall: decoded_function_args: dict[str, Any] +# Type alias for tool execution results ExecuteToolCallResult = LocalToolCallExecuted | ShouldPassThroughToolCall - class ToolContext(abc.ABC): + """Abstract base class for managing tool declarations and executions.""" _tool_declarations: dict[str, ToolDeclaration] def __init__(self) -> None: - # TODO should be an ordered dict + # TODO: This should be an ordered dict self._tool_declarations = {} def register_function( @@ -384,6 +433,7 @@ class ToolContext(abc.ABC): parameters: dict[str, Any], fn: Callable[..., Any], ) -> None: + """Register a local function as a tool.""" self._tool_declarations[name] = LocalFunctionToolDeclaration( name=name, description=description, parameters=parameters, function=fn ) @@ -395,6 +445,7 @@ class ToolContext(abc.ABC): description: str = "", parameters: dict[str, Any], ) -> None: + """Register a client function as a tool.""" self._tool_declarations[name] = PassThroughFunctionToolDeclaration( name=name, description=description, parameters=parameters ) @@ -402,6 +453,7 @@ class ToolContext(abc.ABC): async def execute_tool( self, tool_name: str, encoded_function_args: str ) -> ExecuteToolCallResult | None: + """Execute a tool based on its name and provided arguments.""" tool = self._tool_declarations.get(tool_name) if not tool: return None @@ -421,15 +473,15 @@ class ToolContext(abc.ABC): assert_never(tool) def model_description(self) -> list[dict[str, Any]]: + """Generate a description of all registered tools for the model.""" return [v.model_description() for v in self._tool_declarations.values()] - class ClientToolCallResponse(BaseModel): tool_call_id: str result: dict[str, Any] | str | float | int | bool | None = None - class RealtimeKitAgent: + """Main agent class for handling real-time communication and processing.""" engine: RtcEngine channel: Channel client: RealtimeApiClient @@ -448,14 +500,18 
@@ class RealtimeKitAgent: inference_config: InferenceConfig, tools: ToolContext | None, ) -> None: + """Set up and run the agent with the provided configuration.""" + # Connect to a channel using the provided RtcEngine channel = await engine.connect(channelId="realtimekit_agora", uid="123") try: + # Create and enter a context manager for the RealtimeApiClient async with RealtimeApiClient( base_uri=os.getenv("REALTIME_API_BASE_URI", "wss://api.openai.com"), api_key=os.getenv("OPENAI_API_KEY"), verbose=False, ) as client: + # Send a message to update the session configuration await client.send_message( messages.UpdateSessionConfig( session=messages.SessionResource(), @@ -466,6 +522,7 @@ class RealtimeKitAgent: ) ) + # Concurrently wait for the start session message and send the conversation config [start_session_message, _] = await asyncio.gather( *[ anext(client.listen()), @@ -480,19 +537,27 @@ class RealtimeKitAgent: ), ] ) + + # Ensure the received message is of the correct type assert isinstance(start_session_message, messages.StartSession) + + # Print session information print( f"Session started: {start_session_message.session.id} model: {start_session_message.session.model}" ) + # Create an instance of the agent agent = cls( client=client, tools=tools, channel=channel, ) + + # Run the agent await agent.run() finally: + # Ensure disconnection and shutdown occur, even if an exception is raised await engine.disconnect() await shutdown(asyncio.get_event_loop()) @@ -504,6 +569,7 @@ class RealtimeKitAgent: inference_config: InferenceConfig, tools: ToolContext | None = None, ) -> None: + """Entry point for setting up and running the agent.""" await cls.setup_and_run_agent( engine=engine, inference_config=inference_config, tools=tools ) @@ -515,33 +581,37 @@ class RealtimeKitAgent: tools: ToolContext | None, channel: Channel, ) -> None: + """Initialize the RealtimeKitAgent.""" self.client = client self.tools = tools self._client_tool_futures = {} self.channel = 
channel async def run(self) -> None: + """Main loop for running the agent.""" def log_exception(t: asyncio.Task[Any]) -> None: + """Log unhandled exceptions from tasks.""" if not t.cancelled() and t.exception(): logger.error( - "unhandled exception", + "Unhandled exception", exc_info=t.exception(), ) disconnected_future = asyncio.Future[None]() def _on_disconnected() -> None: + """Callback for when the agent is disconnected.""" if not disconnected_future.done(): disconnected_future.set_result(None) # self.room.on("disconnected", _on_disconnected) + # Create and monitor tasks for streaming audio and processing messages asyncio.create_task(self._stream_input_audio_to_model()).add_done_callback( log_exception ) - asyncio.create_task( - self._stream_audio_queue_to_audio_output() - ).add_done_callback(log_exception) + asyncio.create_task(self._stream_audio_queue_to_audio_output()).add_done_callback( + log_exception) asyncio.create_task(self._process_model_messages()).add_done_callback( log_exception @@ -551,116 +621,129 @@ class RealtimeKitAgent: logger.info("Agent finished running") async def _stream_input_audio_to_model(self) -> None: + """Stream input audio frames to the model.""" audio_frames = self.channel.get_audio_frames() async for audio_frame in audio_frames: # send the frame to the model via the API client await self.client.send_audio_data(audio_frame.data) async def _stream_audio_queue_to_audio_output(self) -> None: + """Stream audio from the queue to the audio output.""" while True: # audio queue contains audio data from the model, send it the end-user via our local audio source frame = await self.audio_queue.get() await self.channel.push_audio_frame(frame) - await asyncio.sleep(0) # allow other tasks to run - + await asyncio.sleep(0) # allow other tasks to run async def _process_model_messages(self) -> None: + """Process messages received from the model.""" async for message in self.client.listen(): match message: case messages.ResonseAudioDelta(): - # 
logger.info("Received audio message") + # Process incoming audio data await self.audio_queue.put(base64.b64decode(message.delta)) case messages.ResonseAudioTranscriptionDelta(): - logger.info(f"Received text message {message=}") + logger.info(f"Received text transcription delta: {message=}") await self.channel.chat.send_message(ChatMessage(message=message.delta, msg_id=message.output_item_id)) case messages.ResonseAudioTranscriptionDone(): - logger.info(f"Text message done: {message=}") + logger.info(f"Text transcription completed: {message=}") await self.channel.chat.send_message(ChatMessage(message=message.value, msg_id=message.output_item_id, done=True)) case messages.MessageAdded(): + # Handle message addition event pass + case messages.ServerAddMessage(): + # Handle server message addition event pass case messages.VADSpeechStarted(): + # Handle Voice Activity Detection speech start event pass + case messages.VADSpeechStopped(): + # Handle Voice Activity Detection speech stop event pass case messages.GenerationCanceled(): - logger.info(f"Server turn canceled: {message=}") + logger.info(f"Server generation canceled: {message=}") case messages.GenerationFinished(): - # TODO this is where we mark no longer appending text - logger.info(f"Server turn finished: {message=}") - # await self.channel.generation_finished() + logger.info(f"Server generation finished: {message=}") + # TODO: Implement logic to mark the end of text appending case messages.AddContent(type=messages.AddContentType.TOOL_CALL): - # TODO implement streaming tool calls - logger.info(f"Received tool call buffer add {message=}") + logger.info(f"Received tool call content: {message=}") + # TODO: Implement streaming tool calls case messages.RealtimeError(error=error): - # TODO do we have to stop the session here? 
- logger.error(f"Received error message {error=}") + logger.error(f"Received error message: {error=}") + # TODO: Determine if session termination is necessary case _: - logger.warning(f"Unhandled message {message=}") + logger.warning(f"Unhandled message type: {message=}") async def shutdown(loop, signal=None): """Gracefully shut down the application.""" if signal: print(f"Received exit signal {signal.name}...") - - tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()] - - print(f"Cancelling {len(tasks)} outstanding tasks") - for task in tasks: - task.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - loop.stop() + tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()] + + print(f"Cancelling {len(tasks)} outstanding tasks") + for task in tasks: + task.cancel() + + await asyncio.gather(*tasks, return_exceptions=True) + loop.stop() -if __name__ == "__main__": +if __name__ == "__main__": # Load environment variables and run the agent load_dotenv() asyncio.run( RealtimeKitAgent.entry_point( engine=RtcEngine(appid="aab8b8f5a8cd4469a63042fcfafe7063"), inference_config=InferenceConfig( - system_message="""\ -You are a helpful assistant. If asked about the weather make sure to use the provided tool to get that information. \ -If you are asked a question that requires a tool, say something like "working on that" and dont provide a concrete response \ -until you have received the response to the tool call.\ + system_message="""\\ +You are a helpful assistant. If asked about the weather, make sure to use the provided tool to get that information. 
\\ +If you are asked a question that requires a tool, say something like "working on that" and don't provide a concrete response \\ +until you have received the response to the tool call.\\ """, voice=messages.Voices.Alloy, turn_detection=messages.TurnDetectionTypes.SERVER_VAD, ), ) - )`} + ) +`} + -The `agent.py` imports key classes from `rtc.py`, a wrapper around the Agora Python Voice SDK. For SDK setup and dependencies, refer to [Voice calling quickstart](/voice-calling/get-started/get-started-sdk?platform=python). -Following is the complete code for `rtc.py`. +The `agent.py` imports key classes from `rtc.py`, which implements the server-side Agora Python Voice SDK, facilitating communication and managing audio streams. For SDK setup and dependencies, refer to [Voice calling quickstart](/voice-calling/get-started/get-started-sdk?platform=python). + +Below is the complete code for `rtc.py`.
-Complete code for `rtc.py` - + Complete code for `rtc.py` +
-## Test your code +## Test the code -1. Update the values for `AGORA_APP_ID` and ` OPENAI_API_KEY` in the project's `.env` file. +1. **Update the values for** `AGORA_APP_ID` **and** `OPENAI_API_KEY` **in the project's** `.env` **file**. + This step ensures that the necessary credentials for Agora and OpenAI are correctly configured in your project. -2. Execute the following command to run your app: +2. **Execute the following command to run your app**: - ```bash - python3 agent.py - ``` + ```bash + python3 agent.py + ``` + + This command launches the `agent.py` script, initializing the Agora channel and the OpenAI API connection. ## Reference -This section contains content that completes the information on this page, or points you to documentation that explains other aspects to this product. +This section contains additional information or links to relevant documentation that complements the current page or explains other aspects of the product. -- [Voice calling quickstart (Python)](/voice-calling/get-started/get-started-sdk?platform=python) \ No newline at end of file +- [Voice calling quickstart (Python)](/voice-calling/get-started/get-started-sdk?platform=python) diff --git a/shared/video-sdk/get-started/get-started-sdk/project-implementation/python.mdx b/shared/video-sdk/get-started/get-started-sdk/project-implementation/python.mdx index 27fa285a6..7c956f7f6 100644 --- a/shared/video-sdk/get-started/get-started-sdk/project-implementation/python.mdx +++ b/shared/video-sdk/get-started/get-started-sdk/project-implementation/python.mdx @@ -23,7 +23,6 @@ from agora.rtc.local_user import LocalUser from agora.rtc.local_user_observer import IRTCLocalUserObserver from agora.rtc.rtc_connection import RTCConnection, RTCConnInfo from agora.rtc.rtc_connection_observer import IRTCConnectionObserver -from pyee.asyncio import AsyncIOEventEmitter ``` ### Initialize the engine @@ -52,6 +51,10 @@ class RtcEngine: To asynchronously join a channel, implement a `Channel` class. 
When you create an instance of the class, the initializer sets up the necessary components for joining a channel. It takes an instance of `RtcEngine`, a `channelId`, and a `uid` as parameters. During initialization, the code creates an event emitter, configures the connection for broadcasting, and registers an event observer for channel events. It also sets up the local user’s audio configuration to enable audio streaming. + +UIDs in the Python SDK are set using a string value. Agora recommends using only numerical values for UID strings to ensure compatibility with all Agora products and extensions. + + ```python class Channel(): def __init__( diff --git a/shared/video-sdk/get-started/get-started-sdk/project-setup/python.mdx b/shared/video-sdk/get-started/get-started-sdk/project-setup/python.mdx index d7bae4b00..564569bdf 100644 --- a/shared/video-sdk/get-started/get-started-sdk/project-setup/python.mdx +++ b/shared/video-sdk/get-started/get-started-sdk/project-setup/python.mdx @@ -12,10 +12,15 @@ pip3 install pyee ``` -1. Install the SDK. +1. Install the server side SDK. ``` pip3 install agora-python-server-sdk ``` + + The Python SDK is a server side SDK. + + + \ No newline at end of file diff --git a/shared/video-sdk/get-started/get-started-sdk/project-test/python.mdx b/shared/video-sdk/get-started/get-started-sdk/project-test/python.mdx index 2ca2505aa..a684aea31 100644 --- a/shared/video-sdk/get-started/get-started-sdk/project-test/python.mdx +++ b/shared/video-sdk/get-started/get-started-sdk/project-test/python.mdx @@ -2,9 +2,9 @@ Follow these steps to test the demo code: -1. Create a file named `rtc.py` and paste the [complete source code](#complete-source-code) into this file. +1. Create a file named `rtc.py` and paste the [complete source code](#complete-code) into this file. -1. Create a file named `test_rtc.py` in the same folder as `rtc.py` and copy the following code to the file: +1. 
Create a file named `main.py` in the same folder as `rtc.py` and copy the following code to the file: ```python import asyncio @@ -38,7 +38,7 @@ Follow these steps to test the demo code: 1. To run the app, execute the following command in your terminal: ```bash - python3 run_rtc.py + python3 main.py ``` You see output similar to the following: