enricoros · zoollcar · Oct 14, 2024 · Oct 24, 2024 · Oct 24, 2024 · Oct 25, 2024
diff --git a/pages/info/debug.tsx b/pages/info/debug.tsx
@@ -20,7 +20,7 @@ import { ROUTE_APP_CHAT, ROUTE_INDEX } from '~/common/app.routes';
 import { incrementalNewsVersion, useAppNewsStateStore } from '../../src/apps/news/news.version';
 
 // capabilities access
-import { useCapabilityBrowserSpeechRecognition, useCapabilityElevenLabs, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useVoiceCapability, useCapabilityTextToImage } from '~/common/components/useCapabilities';
 
 // stores access
 import { getLLMsDebugInfo } from '~/common/stores/llms/store-llms';
@@ -96,7 +96,7 @@ function AppDebug() {
   const cProduct = {
     capabilities: {
       mic: useCapabilityBrowserSpeechRecognition(),
-      elevenLabs: useCapabilityElevenLabs(),
+      elevenLabs: useVoiceCapability(),
       textToImage: useCapabilityTextToImage(),
     },
     models: getLLMsDebugInfo(),

diff --git a/src/apps/call/CallWizard.tsx b/src/apps/call/CallWizard.tsx
@@ -12,7 +12,7 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 import { animationColorRainbow } from '~/common/util/animUtils';
 import { navigateBack } from '~/common/app.routes';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
-import { useCapabilityBrowserSpeechRecognition, useCapabilityElevenLabs } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useVoiceCapability } from '~/common/components/useCapabilities';
 import { useChatStore } from '~/common/stores/chat/store-chats';
 import { useUICounter } from '~/common/state/store-ui';
 
@@ -45,7 +45,7 @@ export function CallWizard(props: { strict?: boolean, conversationId: string | n
 
   // external state
   const recognition = useCapabilityBrowserSpeechRecognition();
-  const synthesis = useCapabilityElevenLabs();
+  const synthesis = useVoiceCapability();
   const chatIsEmpty = useChatStore(state => {
     if (!props.conversationId)
       return false;

diff --git a/src/apps/call/Telephone.tsx b/src/apps/call/Telephone.tsx
@@ -13,7 +13,7 @@ import { ScrollToBottom } from '~/common/scroll-to-bottom/ScrollToBottom';
 import { ScrollToBottomButton } from '~/common/scroll-to-bottom/ScrollToBottomButton';
 import { useChatLLMDropdown } from '../chat/components/layout-bar/useLLMDropdown';
 
-import { EXPERIMENTAL_speakTextStream } from '~/modules/elevenlabs/elevenlabs.client';
+import { EXPERIMENTAL_speakTextStream } from '~/common/components/useVoiceCapabilities';
 import { SystemPurposeId, SystemPurposes } from '../../data';
 import { llmStreamingChatGenerate, VChatMessageIn } from '~/modules/llms/llm.client';
 import { useElevenLabsVoiceDropdown } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
@@ -245,13 +245,22 @@ export function Telephone(props: {
     // perform completion
     responseAbortController.current = new AbortController();
     let finalText = '';
+    let currentSentence = '';
     let error: any | null = null;
     setPersonaTextInterim('💭...');
     llmStreamingChatGenerate(chatLLMId, callPrompt, 'call', callMessages[0].id, null, null, responseAbortController.current.signal, ({ textSoFar }) => {
       const text = textSoFar?.trim();
       if (text) {
-        finalText = text;
         setPersonaTextInterim(text);
+
+        // Maintain and say the current sentence
+        if (/[.,!?]$/.test(text)) {
+          currentSentence = text.substring(finalText?.length)
+          finalText = text
+          if (currentSentence?.length >= 1)
+            void EXPERIMENTAL_speakTextStream(currentSentence, personaVoiceId);
+        }
+        currentSentence = text.substring(finalText?.length) // to be added to the final text
       }
     }).catch((err: DOMException) => {
       if (err?.name !== 'AbortError')
@@ -261,8 +270,8 @@ export function Telephone(props: {
       if (finalText || error)
         setCallMessages(messages => [...messages, createDMessageTextContent('assistant', finalText + (error ? ` (ERROR: ${error.message || error.toString()})` : ''))]); // [state] append assistant:call_response
       // fire/forget
-      if (finalText?.length >= 1)
-        void EXPERIMENTAL_speakTextStream(finalText, personaVoiceId);
+      if (currentSentence?.length >= 1)
+        void EXPERIMENTAL_speakTextStream(currentSentence, personaVoiceId);
     });
 
     return () => {

diff --git a/src/apps/chat/AppChat.tsx b/src/apps/chat/AppChat.tsx
@@ -10,7 +10,7 @@ import { FlattenerModal } from '~/modules/aifn/flatten/FlattenerModal';
 import { TradeConfig, TradeModal } from '~/modules/trade/TradeModal';
 import { downloadSingleChat, importConversationsFromFilesAtRest, openConversationsAtRestPicker } from '~/modules/trade/trade.client';
 import { imaginePromptFromTextOrThrow } from '~/modules/aifn/imagine/imaginePromptFromText';
-import { speakText } from '~/modules/elevenlabs/elevenlabs.client';
+import { speakText } from '~/common/components/useVoiceCapabilities';
 import { useAreBeamsOpen } from '~/modules/beam/store-beam.hooks';
 import { useCapabilityTextToImage } from '~/modules/t2i/t2i.client';
 

diff --git a/src/apps/chat/components/ChatMessageList.tsx b/src/apps/chat/components/ChatMessageList.tsx
@@ -19,7 +19,7 @@ import { getConversation, useChatStore } from '~/common/stores/chat/store-chats'
 import { openFileForAttaching } from '~/common/components/ButtonAttachFiles';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
 import { useBrowserTranslationWarning } from '~/common/components/useIsBrowserTranslating';
-import { useCapabilityElevenLabs } from '~/common/components/useCapabilities';
+import { useVoiceCapability } from '~/common/components/useCapabilities';
 import { useChatOverlayStore } from '~/common/chat-overlay/store-chat-overlay';
 import { useScrollToBottom } from '~/common/scroll-to-bottom/useScrollToBottom';
 
@@ -75,7 +75,7 @@ export function ChatMessageList(props: {
     _composerInReferenceToCount: state.inReferenceTo?.length ?? 0,
     ephemerals: state.ephemerals?.length ? state.ephemerals : null,
   })));
-  const { mayWork: isSpeakable } = useCapabilityElevenLabs();
+  const { mayWork: isSpeakable } = useVoiceCapability();
 
   // derived state
   const { conversationHandler, conversationId, capabilityHasT2I, onConversationBranch, onConversationExecuteHistory, onTextDiagram, onTextImagine, onTextSpeak } = props;

diff --git a/src/apps/chat/store-app-chat.ts b/src/apps/chat/store-app-chat.ts
@@ -1,6 +1,7 @@
 import { create } from 'zustand';
 import { persist } from 'zustand/middleware';
 import { useShallow } from 'zustand/react/shallow';
+import { ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 import type { DLLMId } from '~/common/stores/llms/llms.types';
 
@@ -51,6 +52,12 @@ interface AppChatStore {
   micTimeoutMs: number;
   setMicTimeoutMs: (micTimeoutMs: number) => void;
 
+  TTSEngine: string;
+  setTTSEngine: (TTSEngine: string) => void;
+
+  ASREngine: string;
+  setASREngine: (ASREngine: string) => void;
+
   showPersonaIcons: boolean;
   setShowPersonaIcons: (showPersonaIcons: boolean) => void;
 
@@ -114,6 +121,12 @@ const useAppChatStore = create<AppChatStore>()(persist(
     micTimeoutMs: 2000,
     setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),
 
+    TTSEngine: TTSEngineList[0],
+    setTTSEngine: (TTSEngine: string) => _set({ TTSEngine }),
+
+    ASREngine: ASREngineList[0],
+    setASREngine: (ASREngine: string) => _set({ ASREngine }),
+
     showPersonaIcons: true,
     setShowPersonaIcons: (showPersonaIcons: boolean) => _set({ showPersonaIcons }),
 
@@ -198,6 +211,13 @@ export const useChatMicTimeoutMsValue = (): number =>
 export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] =>
   useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs]));
 
+export const useTTSEngine = (): [string, (TTSEngine: string) => void] =>
+  useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine])); // [selected TTS engine name, its setter]
+export const getTTSEngine = (): string => useAppChatStore.getState().TTSEngine; // snapshot read via getState(), no subscription
+
+export const useASREngine = (): [string, (ASREngine: string) => void] =>
+  useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine])); // [selected ASR engine name, its setter]
+
 export const useChatDrawerFilters = () => {
   const values = useAppChatStore(useShallow(state => ({
     filterHasDocFragments: state.filterHasDocFragments,

diff --git a/src/apps/settings-modal/SettingsModal.tsx b/src/apps/settings-modal/SettingsModal.tsx
@@ -22,6 +22,9 @@ import { AppChatSettingsAI } from './AppChatSettingsAI';
 import { AppChatSettingsUI } from './settings-ui/AppChatSettingsUI';
 import { UxLabsSettings } from './UxLabsSettings';
 import { VoiceSettings } from './VoiceSettings';
+import { BrowserSpeechSettings } from '~/modules/browser/speech-synthesis/BrowserSpeechSettings';
+
+import { useTTSEngine } from 'src/apps/chat/store-app-chat';
 
 
 // styled <AccordionGroup variant='plain'> into a Topics component
@@ -122,6 +125,8 @@ export function SettingsModal(props: {
   // external state
   const isMobile = useIsMobile();
 
+  const [TTSEngine] = useTTSEngine()
+
   // handlers
 
   const { setTab } = props;
@@ -193,9 +198,12 @@ export function SettingsModal(props: {
             <Topic icon='🎙️' title='Voice settings'>
               <VoiceSettings />
             </Topic>
-            <Topic icon='📢' title='ElevenLabs API'>
+            {TTSEngine === 'Elevenlabs' && <Topic icon='📢' title='ElevenLabs API'>
               <ElevenlabsSettings />
-            </Topic>
+            </Topic>}
+            {TTSEngine === 'Web Speech API' && <Topic icon='📢' title='Web Speech API'>
+              <BrowserSpeechSettings />
+            </Topic>}
           </Topics>
         </TabPanel>
 

diff --git a/src/apps/settings-modal/VoiceSettings.tsx b/src/apps/settings-modal/VoiceSettings.tsx
@@ -2,24 +2,25 @@ import * as React from 'react';
 
 import { FormControl } from '@mui/joy';
 
-import { useChatAutoAI, useChatMicTimeoutMs } from '../chat/store-app-chat';
+import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
+
 
-import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
 
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
 import { LanguageSelect } from '~/common/components/LanguageSelect';
 import { useIsMobile } from '~/common/components/useMatchMedia';
-
+import { hasVoices, ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 export function VoiceSettings() {
 
   // external state
   const isMobile = useIsMobile();
   const { autoSpeak, setAutoSpeak } = useChatAutoAI();
-  const { hasVoices } = useElevenLabsVoices();
-  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
 
+  const [chatTimeoutMs, setChatTimeoutMs]  = useChatMicTimeoutMs();
+  const [TTSEngine, setTTSEngine ] = useTTSEngine();
+  const [ASREngine, setASREngine ] = useASREngine();
 
   // this converts from string keys to numbers and vice versa
   const chatTimeoutValue: string = '' + chatTimeoutMs;
@@ -59,5 +60,21 @@ export function VoiceSettings() {
       value={autoSpeak} onChange={setAutoSpeak}
     />
 
+    <FormRadioControl
+      title='TTS engine'
+      description='Text to speech'
+      tooltip=''
+      options={TTSEngineList.map((i) => ({ value: i, label: i }))}
+      value={TTSEngine} onChange={setTTSEngine}
+    />
+
+    <FormRadioControl
+      title='ASR engine'
+      description='Automatic Speech Recognition'
+      tooltip=''
+      options={ASREngineList.map((i) => ({ value: i, label: i }))}
+      value={ASREngine} onChange={setASREngine}
+    />
+
   </>;
 }
diff --git a/src/common/components/useCapabilities.ts b/src/common/components/useCapabilities.ts
@@ -22,15 +22,15 @@ export interface CapabilityBrowserSpeechRecognition {
 export { browserSpeechRecognitionCapability as useCapabilityBrowserSpeechRecognition } from './useSpeechRecognition';
 
 
-/// Speech Synthesis: ElevenLabs
+/// Speech Synthesis
 
-export interface CapabilityElevenLabsSpeechSynthesis {
+export interface CapabilitySpeechSynthesis { // engine-agnostic shape returned by the speech-synthesis capability hooks
   mayWork: boolean;
   isConfiguredServerSide: boolean;
   isConfiguredClientSide: boolean;
 }
 
-export { useCapability as useCapabilityElevenLabs } from '~/modules/elevenlabs/elevenlabs.client';
+export { useCapability as useVoiceCapability } from '~/common/components/useVoiceCapabilities';
 
 
 /// Image Generation

diff --git a/src/common/components/useVoiceCapabilities.ts b/src/common/components/useVoiceCapabilities.ts
@@ -0,0 +1,74 @@
+import { getTTSEngine } from 'src/apps/chat/store-app-chat';
+import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
+
+import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client'
+import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client'
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client'
+
+import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+
+import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
+import { useBrowserSpeechVoices } from '~/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown';
+
+// Selectable text-to-speech (TTS) engine names; the app chat store uses the first entry as its default.
+// NOTE(review): that store reads this list while it is being created, and this module imports getTTSEngine
+// from the same store (circular import) - confirm the load order cannot leave the list undefined there.
+export const TTSEngineList: string[] = ['Elevenlabs', 'Web Speech API'];
+
+// Selectable automatic-speech-recognition (ASR) engine names.
+// The app chat store uses the first entry as its default.
+export const ASREngineList: string[] = ['Web Speech API'];
+
+/** Returns the voices hook (uncalled) of the TTS engine currently selected in the app chat store; throws on an unknown engine name. */
+export function getConditionalVoices() {
+  switch (getTTSEngine()) {
+    case 'Elevenlabs': return useElevenLabsVoices;
+    case 'Web Speech API': return useBrowserSpeechVoices;
+    default:
+      throw new Error('TTSEngine is not found');
+  }
+}
+
+/** Reads `hasVoices` from the selected engine's voices hook; it invokes a `use*` hook, so presumably call it only during render - confirm. */
+export function hasVoices(): boolean {
+  return getConditionalVoices()().hasVoices;
+}
+
+/** Returns the capability hook (uncalled) of the TTS engine currently selected in the app chat store; throws on an unknown engine name. */
+export function getConditionalCapability(): () => CapabilitySpeechSynthesis {
+  switch (getTTSEngine()) {
+    case 'Elevenlabs': return useElevenlabsCapability;
+    case 'Web Speech API': return useBrowserSpeechSynthesisCapability;
+    default:
+      throw new Error('TTSEngine is not found');
+  }
+}
+
+// NOTE(review): the engine hook is chosen via a non-reactive store read, so the hook invoked may differ between renders (Rules of Hooks) - confirm.
+export function useCapability(): CapabilitySpeechSynthesis {
+  const useEngineCapability = getConditionalCapability();
+  return useEngineCapability();
+}
+/** Delegates to the `speakText` of the TTS engine selected in the app chat store, forwarding `text` and `voiceId`; rejects on an unknown engine name. */
+export async function speakText(text: string, voiceId?: string) {
+  switch (getTTSEngine()) {
+    case 'Elevenlabs': return await elevenlabsSpeakText(text, voiceId);
+    case 'Web Speech API': return await browserSpeechSynthesisSpeakText(text, voiceId);
+    default:
+      throw new Error('TTSEngine is not found');
+  }
+}
+
+// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
+
+/** Delegates to the selected TTS engine's `EXPERIMENTAL_speakTextStream`, forwarding `text` and `voiceId`; rejects on an unknown engine name. */
+export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+  switch (getTTSEngine()) {
+    case 'Elevenlabs': return await EXPERIMENTAL_elevenlabsSpeakTextStream(text, voiceId);
+    case 'Web Speech API': return await EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream(text, voiceId);
+    default:
+      throw new Error('TTSEngine is not found');
+  }
+}