From 2f8e879976384a93b333afdf8df21d9fa3af9265 Mon Sep 17 00:00:00 2001
From: Enrico Ros
Date: Tue, 23 Apr 2024 01:45:27 -0700
Subject: [PATCH] Llms: fix Streaming timeouts

---
 .../llms/server/llm.server.streaming.ts      | 23 +++++++--
 .../llms/vendors/unifiedStreamingClient.ts   | 50 +++++++++++++------
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/src/modules/llms/server/llm.server.streaming.ts b/src/modules/llms/server/llm.server.streaming.ts
index f4e3b4d82..1d056e6ae 100644
--- a/src/modules/llms/server/llm.server.streaming.ts
+++ b/src/modules/llms/server/llm.server.streaming.ts
@@ -54,10 +54,18 @@ const chatStreamingInputSchema = z.object({
 });
 export type ChatStreamingInputSchema = z.infer<typeof chatStreamingInputSchema>;
 
+// the purpose is to send something out even before the upstream stream starts, so that we keep the connection up
+const chatStreamingStartOutputPacketSchema = z.object({
+  type: z.enum(['start']),
+});
+export type ChatStreamingPreambleStartSchema = z.infer<typeof chatStreamingStartOutputPacketSchema>;
+
+// the purpose is to have a first packet that contains the model name, so that the client can display it
+// this is a hack until we have a better streaming format
 const chatStreamingFirstOutputPacketSchema = z.object({
   model: z.string(),
 });
-export type ChatStreamingFirstOutputPacketSchema = z.infer<typeof chatStreamingFirstOutputPacketSchema>;
+export type ChatStreamingPreambleModelSchema = z.infer<typeof chatStreamingFirstOutputPacketSchema>;
 
 
 export async function llmStreamingRelayHandler(req: NextRequest): Promise<Response> {
@@ -147,6 +155,11 @@ export async function llmStreamingRelayHandler(req: NextRequest): Promise<Response> {
 
     start: async (controller): Promise<void> => {
 
+      // Send initial packet indicating the start of the stream
+      const startPacket: ChatStreamingPreambleStartSchema = { type: 'start' };
+      controller.enqueue(textEncoder.encode(JSON.stringify(startPacket)));
+
+
       // only used for debugging
       let debugLastMs: number | null = null;
 
@@ -293,7 +306,7 @@ function createStreamParserAnthropicMessages(): AIStreamParser {
         responseMessage = anthropicWireMessagesResponseSchema.parse(message);
         // hack: prepend the model name to the first packet
         if (firstMessage) {
-          const firstPacket: ChatStreamingFirstOutputPacketSchema = { model: responseMessage.model };
+          const firstPacket: ChatStreamingPreambleModelSchema = { model: responseMessage.model };
           text = JSON.stringify(firstPacket);
         }
         break;
@@ -408,7 +421,7 @@ function createStreamParserGemini(modelName: string): AIStreamParser {
     // hack: prepend the model name to the first packet
     if (!hasBegun) {
       hasBegun = true;
-      const firstPacket: ChatStreamingFirstOutputPacketSchema = { model: modelName };
+      const firstPacket: ChatStreamingPreambleModelSchema = { model: modelName };
       text = JSON.stringify(firstPacket) + text;
     }
 
@@ -444,7 +457,7 @@ function createStreamParserOllama(): AIStreamParser {
     // hack: prepend the model name to the first packet
     if (!hasBegun && chunk.model) {
       hasBegun = true;
-      const firstPacket: ChatStreamingFirstOutputPacketSchema = { model: chunk.model };
+      const firstPacket: ChatStreamingPreambleModelSchema = { model: chunk.model };
       text = JSON.stringify(firstPacket) + text;
     }
 
@@ -485,7 +498,7 @@ function createStreamParserOpenAI(): AIStreamParser {
     // hack: prepend the model name to the first packet
     if (!hasBegun) {
       hasBegun = true;
-      const firstPacket: ChatStreamingFirstOutputPacketSchema = { model: json.model };
+      const firstPacket: ChatStreamingPreambleModelSchema = { model: json.model };
       text = JSON.stringify(firstPacket) + text;
     }
 
diff --git a/src/modules/llms/vendors/unifiedStreamingClient.ts b/src/modules/llms/vendors/unifiedStreamingClient.ts
index a245fde91..07094baa6 100644
--- a/src/modules/llms/vendors/unifiedStreamingClient.ts
+++ b/src/modules/llms/vendors/unifiedStreamingClient.ts
@@ -1,7 +1,7 @@
 import { apiAsync } from '~/common/util/trpc.client';
 import { frontendSideFetch } from '~/common/util/clientFetchers';
 
-import type { ChatStreamingFirstOutputPacketSchema, ChatStreamingInputSchema } from '../server/llm.server.streaming';
+import type { ChatStreamingInputSchema, ChatStreamingPreambleModelSchema, ChatStreamingPreambleStartSchema } from '../server/llm.server.streaming';
 import type { DLLMId } from '../store-llms';
 import type { VChatFunctionIn, VChatMessageIn } from '../llm.client';
 
@@ -58,6 +58,7 @@ export async function unifiedStreamingClient
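Notes (not part of the patch): after this change the relay emits up to two JSON preamble packets ahead of any chat text. First comes { "type": "start" }, enqueued from the TransformStream start() callback before any upstream bytes arrive, so idle-connection timeouts in proxies and serverless hosts do not fire while the model warms up. Then, once the vendor stream reports a model name, the existing { "model": "..." } packet follows, so a response on the wire begins like {"type":"start"}{"model":"..."}Hello. The sketch below shows one way a client could peel off this preamble. It is illustrative only, not the code from unifiedStreamingClient.ts: the names readChatStream and onUpdate are assumptions, and it relies on the preamble packets being flat JSON objects (no nested braces), which holds for both schemas in the first hunk.

  // Illustrative sketch: read the relay stream, strip the two preamble
  // packets ({ type: 'start' } keep-alive, then { model: string }), and
  // forward the remaining plain text to the caller.
  async function readChatStream(
    response: Response,
    onUpdate: (update: { text: string, model?: string }) => void,
  ): Promise<void> {
    if (!response.body)
      throw new Error('No response body');
    const reader = response.body.getReader();
    const textDecoder = new TextDecoder();

    let buffer = '';
    let model: string | undefined = undefined;
    let inPreamble = true;

    while (true) {
      const { value, done } = await reader.read();
      if (done) break;
      buffer += textDecoder.decode(value, { stream: true });

      // preamble packets are flat JSON objects, so the first '}' closes one
      while (inPreamble && buffer.startsWith('{')) {
        const end = buffer.indexOf('}');
        if (end === -1) break; // packet split across chunks: wait for more bytes
        let packet: any;
        try {
          packet = JSON.parse(buffer.slice(0, end + 1));
        } catch {
          inPreamble = false; // not a preamble packet after all: treat as content
          break;
        }
        if (packet.type === 'start') {
          // keep-alive received: the relay connection is up
        } else if (typeof packet.model === 'string') {
          model = packet.model;
          inPreamble = false; // the model packet is the last preamble packet
        } else {
          inPreamble = false; // unknown object: treat it as content
          break;
        }
        buffer = buffer.slice(end + 1);
      }

      if (buffer) {
        inPreamble = false;
        onUpdate({ text: buffer, model });
        buffer = '';
      }
    }
  }

Consuming at most one packet per schema, and bailing out on the first parse failure, keeps ordinary content that happens to start with '{' from being swallowed as preamble.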