From c78694f1931cbaeeaa7958fbe4af80192022ebce Mon Sep 17 00:00:00 2001
From: zoollcar
Date: Mon, 14 Oct 2024 13:41:28 +0800
Subject: [PATCH 1/3] TTS for version 2

---
 pages/info/debug.tsx                          |   4 +-
 src/apps/call/CallWizard.tsx                  |   4 +-
 src/apps/call/Telephone.tsx                   |  17 ++-
 src/apps/chat/AppChat.tsx                     |   2 +-
 src/apps/chat/components/ChatMessageList.tsx  |   4 +-
 src/apps/chat/store-app-chat.ts               |  20 +++
 src/apps/settings-modal/SettingsModal.tsx     |  12 +-
 src/apps/settings-modal/VoiceSettings.tsx     |  27 +++-
 src/common/components/useCapabilities.ts      |   6 +-
 src/common/components/useVoiceCapabilities.ts |  74 +++++++++++
 .../BrowserSpeechSettings.tsx                 | 111 ++++++++++++++++
 .../browser.speechSynthesis.client.ts         |  48 +++++++
 .../speech-synthesis/preSelect/Languages.json |  75 +++++++++++
 .../speech-synthesis/store-module-browser.tsx |  40 ++++++
 .../useBrowserSpeechVoiceDropdown.tsx         | 124 ++++++++++++++++++
 src/modules/elevenlabs/ElevenlabsSettings.tsx |   4 +-
 src/modules/elevenlabs/elevenlabs.client.ts   |   4 +-
 .../elevenlabs/useElevenLabsVoiceDropdown.tsx |   4 +
 18 files changed, 555 insertions(+), 25 deletions(-)
 create mode 100644 src/common/components/useVoiceCapabilities.ts
 create mode 100644 src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
 create mode 100644 src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts
 create mode 100644 src/modules/browser/speech-synthesis/preSelect/Languages.json
 create mode 100644 src/modules/browser/speech-synthesis/store-module-browser.tsx
 create mode 100644 src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx

diff --git a/pages/info/debug.tsx b/pages/info/debug.tsx
index 066e70b9a..27ffbbe49 100644
--- a/pages/info/debug.tsx
+++ b/pages/info/debug.tsx
@@ -20,7 +20,7 @@ import { ROUTE_APP_CHAT, ROUTE_INDEX } from '~/common/app.routes';
 import { incrementalNewsVersion, useAppNewsStateStore } from '../../src/apps/news/news.version';
 
 // capabilities access
-import { useCapabilityBrowserSpeechRecognition, useCapabilityElevenLabs, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useVoiceCapability, useCapabilityTextToImage } from '~/common/components/useCapabilities';
 
 // stores access
 import { getLLMsDebugInfo } from '~/common/stores/llms/store-llms';
@@ -96,7 +96,7 @@ function AppDebug() {
   const cProduct = {
     capabilities: {
       mic: useCapabilityBrowserSpeechRecognition(),
-      elevenLabs: useCapabilityElevenLabs(),
+      elevenLabs: useVoiceCapability(),
       textToImage: useCapabilityTextToImage(),
     },
     models: getLLMsDebugInfo(),

diff --git a/src/apps/call/CallWizard.tsx b/src/apps/call/CallWizard.tsx
index ab8a7ad6f..d7bdb767a 100644
--- a/src/apps/call/CallWizard.tsx
+++ b/src/apps/call/CallWizard.tsx
@@ -12,7 +12,7 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 import { animationColorRainbow } from '~/common/util/animUtils';
 import { navigateBack } from '~/common/app.routes';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
-import { useCapabilityBrowserSpeechRecognition, useCapabilityElevenLabs } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useVoiceCapability } from '~/common/components/useCapabilities';
 import { useChatStore } from '~/common/stores/chat/store-chats';
 import { useUICounter } from '~/common/state/store-ui';
 
@@ -45,7 +45,7 @@ export function CallWizard(props: { strict?: boolean, conversationId: string | n
 
   // external state
   const recognition = useCapabilityBrowserSpeechRecognition();
-  const synthesis = useCapabilityElevenLabs();
+  const synthesis = useVoiceCapability();
 
   const chatIsEmpty = useChatStore(state => {
     if (!props.conversationId) return false;

diff --git a/src/apps/call/Telephone.tsx b/src/apps/call/Telephone.tsx
index ebbdd18bc..ba0b108b1 100644
--- a/src/apps/call/Telephone.tsx
+++ b/src/apps/call/Telephone.tsx
@@ -13,7 +13,7 @@ import { ScrollToBottom } from '~/common/scroll-to-bottom/ScrollToBottom';
 import { ScrollToBottomButton } from '~/common/scroll-to-bottom/ScrollToBottomButton';
 import { useChatLLMDropdown } from '../chat/components/layout-bar/useLLMDropdown';
 
-import { EXPERIMENTAL_speakTextStream } from '~/modules/elevenlabs/elevenlabs.client';
+import { EXPERIMENTAL_speakTextStream } from '~/common/components/useVoiceCapabilities';
 import { SystemPurposeId, SystemPurposes } from '../../data';
 import { llmStreamingChatGenerate, VChatMessageIn } from '~/modules/llms/llm.client';
 import { useElevenLabsVoiceDropdown } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
@@ -245,13 +245,22 @@ export function Telephone(props: {
     // perform completion
     responseAbortController.current = new AbortController();
     let finalText = '';
+    let currentSentence = '';
     let error: any | null = null;
     setPersonaTextInterim('💭...');
     llmStreamingChatGenerate(chatLLMId, callPrompt, 'call', callMessages[0].id, null, null, responseAbortController.current.signal, ({ textSoFar }) => {
       const text = textSoFar?.trim();
       if (text) {
-        finalText = text;
         setPersonaTextInterim(text);
+
+        // Maintain and say the current sentence
+        if (/[.,!?]$/.test(text)) {
+          currentSentence = text.substring(finalText?.length)
+          finalText = text
+          if (currentSentence?.length >= 1)
+            void EXPERIMENTAL_speakTextStream(currentSentence, personaVoiceId);
+        }
+        currentSentence = text.substring(finalText?.length) // to be added to the final text
       }
     }).catch((err: DOMException) => {
       if (err?.name !== 'AbortError')
@@ -261,8 +270,8 @@
       if (finalText || error)
        setCallMessages(messages => [...messages, createDMessageTextContent('assistant', finalText + (error ? ` (ERROR: ${error.message || error.toString()})` : ''))]); // [state] append assistant:call_response
       // fire/forget
-      if (finalText?.length >= 1)
-        void EXPERIMENTAL_speakTextStream(finalText, personaVoiceId);
+      if (currentSentence?.length >= 1)
+        void EXPERIMENTAL_speakTextStream(currentSentence, personaVoiceId);
     });
 
     return () => {
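The Telephone.tsx hunks above are the heart of this patch: instead of speaking the whole reply once the stream ends, every time the accumulated text ends in punctuation the delta since the last spoken prefix is handed to the TTS engine, and the final fire-and-forget call now speaks only the trailing fragment rather than repeating the entire reply. The chunking rule can be read in isolation; a minimal sketch (the makeSentenceChunker name and the speak callback are illustrative, not part of the patch):

// Speak only the newly-completed chunk of a growing text stream.
// spokenLength tracks how much of the stream has already been spoken.
function makeSentenceChunker(speak: (chunk: string) => void) {
  let spokenLength = 0;
  return {
    // call on every streaming update, with the full text so far
    onText(textSoFar: string) {
      if (/[.,!?]$/.test(textSoFar)) {
        const chunk = textSoFar.substring(spokenLength);
        if (chunk.length >= 1)
          speak(chunk);
        spokenLength = textSoFar.length;
      }
    },
    // call once the stream ends, to flush any unterminated tail
    flush(finalText: string) {
      const tail = finalText.substring(spokenLength);
      if (tail.length >= 1)
        speak(tail);
      spokenLength = finalText.length;
    },
  };
}

Note that the /[.,!?]$/ test also fires on commas, so the spoken chunks are clause-sized rather than strictly sentence-sized, trading a little prosody for lower speech latency.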
diff --git a/src/apps/chat/AppChat.tsx b/src/apps/chat/AppChat.tsx
index 0c3d17b52..a6107ec1a 100644
--- a/src/apps/chat/AppChat.tsx
+++ b/src/apps/chat/AppChat.tsx
@@ -10,7 +10,7 @@ import { FlattenerModal } from '~/modules/aifn/flatten/FlattenerModal';
 import { TradeConfig, TradeModal } from '~/modules/trade/TradeModal';
 import { downloadSingleChat, importConversationsFromFilesAtRest, openConversationsAtRestPicker } from '~/modules/trade/trade.client';
 import { imaginePromptFromTextOrThrow } from '~/modules/aifn/imagine/imaginePromptFromText';
-import { speakText } from '~/modules/elevenlabs/elevenlabs.client';
+import { speakText } from '~/common/components/useVoiceCapabilities';
 import { useAreBeamsOpen } from '~/modules/beam/store-beam.hooks';
 import { useCapabilityTextToImage } from '~/modules/t2i/t2i.client';

diff --git a/src/apps/chat/components/ChatMessageList.tsx b/src/apps/chat/components/ChatMessageList.tsx
index 8c6cec862..0111b564f 100644
--- a/src/apps/chat/components/ChatMessageList.tsx
+++ b/src/apps/chat/components/ChatMessageList.tsx
@@ -19,7 +19,7 @@ import { getConversation, useChatStore } from '~/common/stores/chat/store-chats'
 import { openFileForAttaching } from '~/common/components/ButtonAttachFiles';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
 import { useBrowserTranslationWarning } from '~/common/components/useIsBrowserTranslating';
-import { useCapabilityElevenLabs } from '~/common/components/useCapabilities';
+import { useVoiceCapability } from '~/common/components/useCapabilities';
 import { useChatOverlayStore } from '~/common/chat-overlay/store-chat-overlay';
 import { useScrollToBottom } from '~/common/scroll-to-bottom/useScrollToBottom';
@@ -75,7 +75,7 @@ export function ChatMessageList(props: {
     _composerInReferenceToCount: state.inReferenceTo?.length ?? 0,
     ephemerals: state.ephemerals?.length ? state.ephemerals : null,
   })));
-  const { mayWork: isSpeakable } = useCapabilityElevenLabs();
+  const { mayWork: isSpeakable } = useVoiceCapability();
 
   // derived state
   const { conversationHandler, conversationId, capabilityHasT2I, onConversationBranch, onConversationExecuteHistory, onTextDiagram, onTextImagine, onTextSpeak } = props;

diff --git a/src/apps/chat/store-app-chat.ts b/src/apps/chat/store-app-chat.ts
index f3fcc163e..4760fe161 100644
--- a/src/apps/chat/store-app-chat.ts
+++ b/src/apps/chat/store-app-chat.ts
@@ -1,6 +1,7 @@
 import { create } from 'zustand';
 import { persist } from 'zustand/middleware';
 import { useShallow } from 'zustand/react/shallow';
+import { ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 import type { DLLMId } from '~/common/stores/llms/llms.types';
 
@@ -51,6 +52,12 @@ interface AppChatStore {
   micTimeoutMs: number;
   setMicTimeoutMs: (micTimeoutMs: number) => void;
 
+  TTSEngine: string;
+  setTTSEngine: (TTSEngine: string) => void;
+
+  ASREngine: string;
+  setASREngine: (ASREngine: string) => void;
+
   showPersonaIcons: boolean;
   setShowPersonaIcons: (showPersonaIcons: boolean) => void;
 
@@ -114,6 +121,12 @@ const useAppChatStore = create<AppChatStore>()(persist(
   micTimeoutMs: 2000,
   setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),
 
+  TTSEngine: TTSEngineList[0],
+  setTTSEngine: (TTSEngine: string) => _set({ TTSEngine }),
+
+  ASREngine: ASREngineList[0],
+  setASREngine: (ASREngine: string) => _set({ ASREngine }),
+
   showPersonaIcons: true,
   setShowPersonaIcons: (showPersonaIcons: boolean) => _set({ showPersonaIcons }),
 
@@ -198,6 +211,13 @@ export const useChatMicTimeoutMsValue = (): number =>
 export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] =>
   useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs]));
 
+export const useTTSEngine = (): [string, (micTimeoutMs: string) => void] =>
+  useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
+export const getTTSEngine = () => useAppChatStore.getState().TTSEngine;
+
+export const useASREngine = (): [string, (micTimeoutMs: string) => void] =>
+  useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine]));
+
 export const useChatDrawerFilters = () => {
   const values = useAppChatStore(useShallow(state => ({
     filterHasDocFragments: state.filterHasDocFragments,

diff --git a/src/apps/settings-modal/SettingsModal.tsx b/src/apps/settings-modal/SettingsModal.tsx
index 3a5742e4c..2af374cac 100644
--- a/src/apps/settings-modal/SettingsModal.tsx
+++ b/src/apps/settings-modal/SettingsModal.tsx
@@ -22,6 +22,9 @@ import { AppChatSettingsAI } from './AppChatSettingsAI';
 import { AppChatSettingsUI } from './settings-ui/AppChatSettingsUI';
 import { UxLabsSettings } from './UxLabsSettings';
 import { VoiceSettings } from './VoiceSettings';
+import { BrowserSpeechSettings } from '~/modules/browser/speech-synthesis/BrowserSpeechSettings';
+
+import { useTTSEngine } from 'src/apps/chat/store-app-chat';
 
 // styled into a Topics component
@@ -122,6 +125,8 @@ export function SettingsModal(props: {
   // external state
   const isMobile = useIsMobile();
 
+  const [TTSEngine] = useTTSEngine()
+
   // handlers
   const { setTab } = props;
@@ -193,9 +198,12 @@
 
-      
+      {TTSEngine === 'Elevenlabs' &&
-      
+      }
+      {TTSEngine === 'Web Speech API' &&
+      
+      }
 

diff --git a/src/apps/settings-modal/VoiceSettings.tsx b/src/apps/settings-modal/VoiceSettings.tsx
index 404f15c59..97712ad2c 100644
--- a/src/apps/settings-modal/VoiceSettings.tsx
+++ b/src/apps/settings-modal/VoiceSettings.tsx
@@ -2,24 +2,25 @@ import * as React from 'react';
 
 import { FormControl } from '@mui/joy';
 
-import { useChatAutoAI, useChatMicTimeoutMs } from '../chat/store-app-chat';
+import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
+
 
-import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
 import { LanguageSelect } from '~/common/components/LanguageSelect';
 import { useIsMobile } from '~/common/components/useMatchMedia';
-
+import { hasVoices, ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 export function VoiceSettings() {
 
   // external state
   const isMobile = useIsMobile();
   const { autoSpeak, setAutoSpeak } = useChatAutoAI();
-  const { hasVoices } = useElevenLabsVoices();
-  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
+  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
+  const [TTSEngine, setTTSEngine ] = useTTSEngine();
+  const [ASREngine, setASREngine ] = useASREngine();
 
   // this converts from string keys to numbers and vice versa
   const chatTimeoutValue: string = '' + chatTimeoutMs;
@@ -59,5 +60,21 @@
       value={autoSpeak} onChange={setAutoSpeak}
     />
+
+    
+      ({ value: i, label: i }))}
+      value={TTSEngine} onChange={setTTSEngine}
+    />
+
+    
+      ({ value: i, label: i }))}
+      value={ASREngine} onChange={setASREngine}
+    />
+
   ;
 }
\ No newline at end of file

diff --git a/src/common/components/useCapabilities.ts b/src/common/components/useCapabilities.ts
index 33a1be905..52b9facef 100644
--- a/src/common/components/useCapabilities.ts
+++ b/src/common/components/useCapabilities.ts
@@ -22,15 +22,15 @@ export interface CapabilityBrowserSpeechRecognition {
 
 export { browserSpeechRecognitionCapability as useCapabilityBrowserSpeechRecognition } from './useSpeechRecognition';
 
-/// Speech Synthesis: ElevenLabs
+/// Speech Synthesis
 
-export interface CapabilityElevenLabsSpeechSynthesis {
+export interface CapabilitySpeechSynthesis {
   mayWork: boolean;
   isConfiguredServerSide: boolean;
   isConfiguredClientSide: boolean;
 }
 
-export { useCapability as useCapabilityElevenLabs } from '~/modules/elevenlabs/elevenlabs.client';
+export { useCapability as useVoiceCapability } from '~/common/components/useVoiceCapabilities';
 
 /// Image Generation

diff --git a/src/common/components/useVoiceCapabilities.ts b/src/common/components/useVoiceCapabilities.ts
new file mode 100644
index 000000000..ad1cc253a
--- /dev/null
+++ b/src/common/components/useVoiceCapabilities.ts
@@ -0,0 +1,74 @@
+import { getTTSEngine } from 'src/apps/chat/store-app-chat';
+import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
+
+import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client'
+import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client'
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client'
+
+import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+
+import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
+import { useBrowserSpeechVoices } from '~/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown';
+
+export const TTSEngineList: string[] = [
+  'Elevenlabs',
+  'Web Speech API'
+]
+
+export const ASREngineList: string[] = [
+  'Web Speech API'
+]
+
+export function getConditionalVoices(){
+  const TTSEngine = getTTSEngine();
+  if (TTSEngine === 'Elevenlabs') {
+    return useElevenLabsVoices
+  }else if (TTSEngine === 'Web Speech API') {
+    return useBrowserSpeechVoices
+  }
+  throw new Error('TTSEngine is not found');
+}
+
+export function hasVoices(): boolean {
+  console.log('getConditionalVoices', getConditionalVoices()().hasVoices)
+  return getConditionalVoices()().hasVoices;
+}
+
+export function getConditionalCapability(): () => CapabilitySpeechSynthesis {
+  const TTSEngine = getTTSEngine();
+  if (TTSEngine === 'Elevenlabs') {
+    return useElevenlabsCapability
+  }else if (TTSEngine === 'Web Speech API') {
+    return useBrowserSpeechSynthesisCapability
+  }
+  throw new Error('TTSEngine is not found');
+}
+
+export function useCapability(): CapabilitySpeechSynthesis {
+  return getConditionalCapability()();
+}
+
+
+export async function speakText(text: string, voiceId?: string) {
+  const TTSEngine = getTTSEngine();
+  if (TTSEngine === 'Elevenlabs') {
+    return await elevenlabsSpeakText(text, voiceId);
+  }else if (TTSEngine === 'Web Speech API') {
+    return await browserSpeechSynthesisSpeakText(text, voiceId);
+  }
+  throw new Error('TTSEngine is not found');
+}
+
+// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
+
+export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+  const TTSEngine = getTTSEngine();
+  if (TTSEngine === 'Elevenlabs') {
+    return await EXPERIMENTAL_elevenlabsSpeakTextStream(text, voiceId);
+  }else if (TTSEngine === 'Web Speech API') {
+    return await EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream(text, voiceId);
+  }
+  throw new Error('TTSEngine is not found');
+}
\ No newline at end of file
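One caveat with the dispatcher file above: useCapability() resolves to either useElevenlabsCapability or useBrowserSpeechSynthesisCapability depending on the persisted engine, so switching engines at runtime changes which React hook executes between renders — something the Rules of Hooks forbid (hasVoices() has the same shape, plus a leftover console.log). A safer arrangement, sketched below under the same two-engine assumption and not part of the patch, is to call both vendor hooks unconditionally and select the result afterwards:

// Hypothetical reshaping of useCapability(): both hooks always run,
// so the hook order stays stable no matter which engine is selected.
function useCapabilityStable(): CapabilitySpeechSynthesis {
  const elevenlabs = useElevenlabsCapability();
  const webspeech = useBrowserSpeechSynthesisCapability();
  return getTTSEngine() === 'Elevenlabs' ? elevenlabs : webspeech;
}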
diff --git a/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx b/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
new file mode 100644
index 000000000..4a6fc6441
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
@@ -0,0 +1,111 @@
+import * as React from 'react';
+
+import { Option, FormControl, Select, Switch, Typography, Box, IconButton } from '@mui/joy';
+import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
+import CloseRounded from '@mui/icons-material/CloseRounded';
+import { addSnackbar } from '~/common/components/snackbar/useSnackbarsStore';
+
+import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
+import { useBrowserSpeechVoiceDropdown } from './useBrowserSpeechVoiceDropdown';
+import { useLanguageCodeForFilter } from './store-module-browser';
+
+// languages are defined as a JSON file
+import languages from './preSelect/Languages.json';
+
+export function BrowserSpeechSettings() {
+
+  // state
+  const [testUtterance, setTestUtterance] = React.useState<string | null>(null);
+  const [voiceNameFilters, setVoiceNameFilters] = React.useState<string[] | null>(null);
+
+  // external state
+  const [languageCode, setLanguageCode] = useLanguageCodeForFilter();
+
+  React.useEffect(() => {
+    if (languageCode) {
+      const fetchFunction = async () => {
+        let res = await fetch(`https://raw.githubusercontent.com/HadrienGardeur/web-speech-recommended-voices/refs/heads/main/json/${languageCode}.json`);
+        let data = await res.json();
+        let voices = data.voices;
+        voices = voices.filter((voice: any) => {
+          return voice.quality.includes('high') || voice.quality.includes('veryHigh');
+        });
+        let voiceNameFilters = voices.map((voice: any) => voice.name);
+        setTestUtterance(data.testUtterance);
+        setVoiceNameFilters(voiceNameFilters);
+      };
+      fetchFunction().catch((err) => {
+        console.log('Error getting voice list: ', err);
+        addSnackbar({ key: 'browser-speech-synthesis', message: 'Error getting voice list', type: 'issue' });
+        setTestUtterance(null);
+        setVoiceNameFilters(null);
+        setLanguageCode('');
+      });
+    } else {
+      setTestUtterance(null);
+      setVoiceNameFilters(null);
+    }
+  }, [languageCode, setLanguageCode]);
+
+  const { voicesDropdown } = useBrowserSpeechVoiceDropdown(true, { voiceNameFilters, testUtterance });
+
+  const languageOptions = React.useMemo(() => {
+    return Object.entries(languages)
+      .sort((a, b) => {
+        return a[1].localeCompare(b[1]);
+      })
+      .map(([languageCode, languageName]) => (
+        
+      ));
+  }, []);
+
+  function handleLanguageChanged(_event: any, newValue: string | null) {
+    setLanguageCode(newValue || '');
+  }
+
+  return (
+    <>
+      
+      
+      
+      
+      
+      {voicesDropdown}
+      
+    
+  );
+}

diff --git a/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts b/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts
new file mode 100644
index 000000000..2814a760a
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts
@@ -0,0 +1,48 @@
+import { CapabilitySpeechSynthesis } from "~/common/components/useCapabilities";
+import { getBrowseVoiceId } from "./store-module-browser";
+
+export function useCapability(): CapabilitySpeechSynthesis {
+  const synth = window.speechSynthesis;
+  const voices = synth.getVoices();
+  const isConfiguredServerSide = false;
+  const isConfiguredClientSide = true;
+  const mayWork = voices.length > 0;
+  return { mayWork, isConfiguredServerSide, isConfiguredClientSide };
+}
+
+
+export async function speakText(text: string, voiceId?: string) {
+  if (!(text?.trim())) return;
+
+  try {
+    const synth = window.speechSynthesis;
+    const utterThis = new SpeechSynthesisUtterance(text);
+    const voices = synth.getVoices();
+    voiceId = voiceId || getBrowseVoiceId();
+    utterThis.voice = voices.find((voice) => voiceId === voice.name) || null;
+    synth.speak(utterThis);
+  } catch (error) {
+    console.error('Error playing first text:', error);
+  }
+}
+
+export async function cancel() {
+  const synth = window.speechSynthesis;
+  synth.cancel();
+}
+
+export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+  if (!(text?.trim())) return;
+
+  try {
+    const synth = window.speechSynthesis;
+    const utterThis = new SpeechSynthesisUtterance(text);
+    const voices = synth.getVoices();
+    voiceId = voiceId || getBrowseVoiceId();
+    utterThis.voice = voices.find((voice) => voiceId === voice.name) || null;
+    synth.speak(utterThis);
+  } catch (error) {
+    // has happened once in months of testing, not sure what was the cause
+    console.error('EXPERIMENTAL_speakTextStream:', error);
+  }
+}
\ No newline at end of file
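Two environment caveats are worth flagging for browser.speechSynthesis.client.ts above: useCapability() touches window.speechSynthesis during render, which throws when Next.js renders the component on the server, and getVoices() returns an empty array until the asynchronous voiceschanged event has fired at least once in some browsers (Chrome in particular). A guarded variant might look like this (a sketch only, not what the patch ships):

// Guarded capability probe: safe under SSR and before voices have loaded.
export function useCapabilityGuarded(): CapabilitySpeechSynthesis {
  const hasSynthesis = typeof window !== 'undefined' && 'speechSynthesis' in window;
  const voices = hasSynthesis ? window.speechSynthesis.getVoices() : [];
  return {
    // getVoices() may legitimately be empty until 'voiceschanged' fires,
    // so an empty list here means "unknown yet", not "unsupported"
    mayWork: hasSynthesis && voices.length > 0,
    isConfiguredServerSide: false,
    isConfiguredClientSide: hasSynthesis,
  };
}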
diff --git a/src/modules/browser/speech-synthesis/preSelect/Languages.json b/src/modules/browser/speech-synthesis/preSelect/Languages.json
new file mode 100644
index 000000000..a2b9ade45
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/preSelect/Languages.json
@@ -0,0 +1,75 @@
+{
+  "ar": "Arabic",
+  "as": "Assamese",
+  "bg": "Bulgarian",
+  "bho": "Bhojpuri",
+  "bn": "Bangla",
+  "brx": "Bodo",
+  "bs": "Bosnian",
+  "ca": "Catalan",
+  "cmn": "Chinese",
+  "cs": "Czech",
+  "cy": "Welsh",
+  "da": "Danish",
+  "de": "German",
+  "doi": "Dogri",
+  "el": "Greek",
+  "en": "English",
+  "es": "Spanish",
+  "et": "Estonian",
+  "eu": "Basque",
+  "fa": "Persian",
+  "fi": "Finnish",
+  "fil": "Filipino",
+  "fr": "French",
+  "gl": "Galician",
+  "gu": "Gujarati",
+  "he": "Hebrew",
+  "hi": "Hindi",
+  "hr": "Croatian",
+  "hu": "Hungarian",
+  "id": "Indonesian",
+  "is": "Icelandic",
+  "it": "Italian",
+  "ja": "Japanese",
+  "jv": "Javanese",
+  "km": "khmer",
+  "kn": "Kannada",
+  "kok": "Konkani",
+  "ko": "Korean",
+  "lt": "Lithuanian",
+  "lv": "Latvia",
+  "mai": "Maithili",
+  "mal": "Malayalam",
+  "mni": "Manipuri",
+  "mr": "Marathi",
+  "ms": "Malay",
+  "nb": "Norwegian Bokmål",
+  "ne": "Nepali",
+  "nl": "Dutch",
+  "od": "Odia",
+  "pa": "Punjabi",
+  "pl": "Polish",
+  "pt": "Portuguese",
+  "ro": "Romanian",
+  "ru": "Russian",
+  "sa": "Sanskrit",
+  "sat": "Santali",
+  "sd": "Sindhi",
+  "si": "Sinhala",
+  "sk": "Slovak",
+  "sl": "Slovenian",
+  "sq": "Albanese",
+  "sr": "Serbian",
+  "su": "Sundanese",
+  "sv": "Swedish",
+  "sw": "Swahili",
+  "ta": "Tamil",
+  "te": "Telugu",
+  "th": "Thai",
+  "tr": "Turkish",
+  "uk": "Ukrainian",
+  "ur": "Urdu",
+  "vi": "Vietnamese",
+  "wuu": "Shanghainese"
+}
\ No newline at end of file

diff --git a/src/modules/browser/speech-synthesis/store-module-browser.tsx b/src/modules/browser/speech-synthesis/store-module-browser.tsx
new file mode 100644
index 000000000..434c9c359
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/store-module-browser.tsx
@@ -0,0 +1,40 @@
+import { create } from 'zustand';
+import { persist } from 'zustand/middleware';
+import { useShallow } from 'zustand/react/shallow';
+
+export type BrowsePageTransform = 'html' | 'text' | 'markdown';
+
+interface BrowseState {
+
+  languageCodeForFilter: string;
+  browseVoiceId: string;
+  setBrowseVoiceId: (value: string) => void;
+  setLanguageCodeForFilter: (value: string) => void;
+
+}
+
+export const useBrowseStore = create<BrowseState>()(
+  persist(
+    (set) => ({
+      languageCodeForFilter: '',
+      browseVoiceId: '',
+      setBrowseVoiceId: (browseVoiceId: string) => set(() => ({ browseVoiceId })),
+      setLanguageCodeForFilter: (languageCodeForFilter: string) => set(() => ({ languageCodeForFilter })),
+    }),
+    {
+      name: 'app-module-browse',
+    },
+  ),
+);
+
+export function useBrowseVoiceId(): [string, (value: string) => void] {
+  return useBrowseStore(useShallow(state => [state.browseVoiceId, state.setBrowseVoiceId]))
+}
+
+export function useLanguageCodeForFilter(): [string, (value: string) => void] {
+  return useBrowseStore(useShallow(state => [state.languageCodeForFilter, state.setLanguageCodeForFilter]))
+}
+
+export function getBrowseVoiceId() {
+  return useBrowseStore.getState().browseVoiceId
+}
\ No newline at end of file

diff --git a/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx b/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx
new file mode 100644
index 000000000..9db7d462e
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx
@@ -0,0 +1,124 @@
+import * as React from 'react';
+
+import { CircularProgress, Option, Select } from '@mui/joy';
+import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
+import RecordVoiceOverTwoToneIcon from '@mui/icons-material/RecordVoiceOverTwoTone';
+
+import { useBrowseVoiceId } from './store-module-browser';
+import { speakText, cancel } from './browser.speechSynthesis.client';
+
+function VoicesDropdown(props: {
+  isValidKey: boolean;
+  isFetchingVoices: boolean;
+  isErrorVoices: boolean;
+  disabled?: boolean;
+  voices: SpeechSynthesisVoice[];
+  voiceId: string;
+  setVoiceId: (voiceId: string) => void;
+}) {
+  const handleVoiceChange = (_event: any, value: string | null) => props.setVoiceId(value === null ? '' : value);
+
+  return (
+    
+  );
+}
+
+function allVoicesObtained(): Promise<SpeechSynthesisVoice[]> {
+  return new Promise(function (resolve, reject) {
+    let voices = window.speechSynthesis.getVoices();
+    if (voices.length !== 0) {
+      resolve(voices);
+    } else {
+      window.speechSynthesis.addEventListener('voiceschanged', function () {
+        voices = window.speechSynthesis.getVoices();
+        resolve(voices);
+      });
+    }
+  });
+}
+
+export function useBrowserSpeechVoices() {
+  const [voices, setVoices] = React.useState<SpeechSynthesisVoice[]>([]);
+
+  React.useEffect(() => {
+    allVoicesObtained().then((data) => setVoices(data));
+  }, []);
+
+  return {
+    hasVoices: voices.length > 0,
+    voices: voices || [],
+  };
+}
+
+export function useBrowserSpeechVoiceDropdown(
+  autoSpeak: boolean,
+  {
+    disabled,
+    voiceNameFilters,
+    testUtterance,
+  }: {
+    disabled?: boolean;
+    voiceNameFilters?: string[] | null;
+    testUtterance?: string | null;
+  },
+) {
+  // external state
+  const { hasVoices, voices } = useBrowserSpeechVoices();
+  const [voiceId, setVoiceId] = useBrowseVoiceId();
+
+  // derived state
+  const voice = voices.find((voice) => voiceId === voice.name);
+  const voiceFiltered = voiceNameFilters ? voices.filter((voice) => voiceNameFilters.includes(voice.name)) : voices;
+
+  // [E] autoSpeak
+  React.useEffect(() => {
+    if (autoSpeak && voice && voiceFiltered.includes(voice)) {
+      speakText(testUtterance ? testUtterance.replace('{name}', voice.name) : `How can I assist you today?`, String(voiceId));
+    }
+    return () => {
+      cancel();
+    };
+  }, [autoSpeak, testUtterance, voice, voiceFiltered, voiceId, voiceNameFilters]);
+
+  const voicesDropdown = React.useMemo(
+    () => (
+      
+    ),
+    [disabled, setVoiceId, voiceFiltered, voiceId],
+  );
+
+  return {
+    hasVoices,
+    voiceId,
+    voiceName: voice?.name,
+    voicesDropdown,
+  };
+}

diff --git a/src/modules/elevenlabs/ElevenlabsSettings.tsx b/src/modules/elevenlabs/ElevenlabsSettings.tsx
index 51b07db94..97ebc64b3 100644
--- a/src/modules/elevenlabs/ElevenlabsSettings.tsx
+++ b/src/modules/elevenlabs/ElevenlabsSettings.tsx
@@ -5,7 +5,7 @@ import { FormControl } from '@mui/joy';
 import { AlreadySet } from '~/common/components/AlreadySet';
 import { FormInputKey } from '~/common/components/forms/FormInputKey';
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
-import { useCapabilityElevenLabs } from '~/common/components/useCapabilities';
+import { useVoiceCapability } from '~/common/components/useCapabilities';
 
 import { isElevenLabsEnabled } from './elevenlabs.client';
 import { useElevenLabsVoiceDropdown } from './useElevenLabsVoiceDropdown';
@@ -16,7 +16,7 @@ export function ElevenlabsSettings() {
 
   // external state
   const [apiKey, setApiKey] = useElevenLabsApiKey();
-  const { isConfiguredServerSide } = useCapabilityElevenLabs();
+  const { isConfiguredServerSide } = useVoiceCapability();
 
   const { voicesDropdown } = useElevenLabsVoiceDropdown(true);

diff --git a/src/modules/elevenlabs/elevenlabs.client.ts b/src/modules/elevenlabs/elevenlabs.client.ts
index 7145cbdb1..9e7e5ed09 100644
--- a/src/modules/elevenlabs/elevenlabs.client.ts
+++ b/src/modules/elevenlabs/elevenlabs.client.ts
@@ -2,7 +2,7 @@ import { getBackendCapabilities } from '~/modules/backend/store-backend-capabili
 
 import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
 import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
-import { CapabilityElevenLabsSpeechSynthesis } from '~/common/components/useCapabilities';
+import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
 import { frontendSideFetch } from '~/common/util/clientFetchers';
 import { useUIPreferencesStore } from '~/common/state/store-ui';
 
@@ -17,7 +17,7 @@ export const isElevenLabsEnabled = (apiKey?: string) => apiKey
   ? isValidElevenLabsApiKey(apiKey)
   : getBackendCapabilities().hasVoiceElevenLabs;
 
-export function useCapability(): CapabilityElevenLabsSpeechSynthesis {
+export function useCapability(): CapabilitySpeechSynthesis {
   const [clientApiKey, voiceId] = useElevenLabsData();
   const isConfiguredServerSide = getBackendCapabilities().hasVoiceElevenLabs;
   const isConfiguredClientSide = clientApiKey ? isValidElevenLabsApiKey(clientApiKey) : false;

diff --git a/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx b/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
index fdfaafe3a..24de0b003 100644
--- a/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
+++ b/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
@@ -82,6 +82,10 @@ export function useElevenLabsVoiceDropdown(autoSpeak: boolean, disabled?: boolea
 
   React.useEffect(() => {
     if (previewUrl)
       void AudioPlayer.playUrl(previewUrl);
+
+    return () => {
+      // TODO: stop audio
+    }
   }, [previewUrl]);
 
   const voicesDropdown = React.useMemo(() =>
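The useElevenLabsVoiceDropdown change above adds an effect cleanup with a TODO: the voice preview keeps playing if the component unmounts or previewUrl changes mid-playback. This patch does not show a stop API on the app's AudioPlayer util, but with a plain HTMLAudioElement the cleanup would look roughly like this (illustrative only, assuming a raw audio element instead of AudioPlayer):

React.useEffect(() => {
  if (!previewUrl) return;
  // assumption: play the preview through a raw HTMLAudioElement
  const audio = new Audio(previewUrl);
  void audio.play().catch(() => { /* autoplay may be blocked */ });
  return () => {
    audio.pause();   // stop playback on unmount / url change
    audio.src = '';  // release the media resource
  };
}, [previewUrl]);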
From 0d5f6613404ff86ae02bc9092f91f4318ec02279 Mon Sep 17 00:00:00 2001
From: zoollcar
Date: Thu, 24 Oct 2024 21:44:35 +0800
Subject: [PATCH 2/3] Add Types, Remove invalid languages

---
 src/apps/chat/store-app-chat.ts               |  22 ++--
 src/apps/settings-modal/SettingsModal.tsx     |   4 +-
 src/apps/settings-modal/VoiceSettings.tsx     | 123 +++++++++---------
 src/common/components/useVoiceCapabilities.ts |  79 ++++++-----
 .../speech-synthesis/preSelect/Languages.json |  31 -----
 5 files changed, 123 insertions(+), 136 deletions(-)

diff --git a/src/apps/chat/store-app-chat.ts b/src/apps/chat/store-app-chat.ts
index 4760fe161..2a91157ab 100644
--- a/src/apps/chat/store-app-chat.ts
+++ b/src/apps/chat/store-app-chat.ts
@@ -1,7 +1,7 @@
 import { create } from 'zustand';
 import { persist } from 'zustand/middleware';
 import { useShallow } from 'zustand/react/shallow';
-import { ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
+import { ASREngineKey, ASREngineList, TTSEngineKey, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 import type { DLLMId } from '~/common/stores/llms/llms.types';
 
@@ -52,11 +52,11 @@ interface AppChatStore {
   micTimeoutMs: number;
   setMicTimeoutMs: (micTimeoutMs: number) => void;
 
-  TTSEngine: string;
-  setTTSEngine: (TTSEngine: string) => void;
+  TTSEngine: TTSEngineKey;
+  setTTSEngine: (TTSEngine: TTSEngineKey) => void;
 
-  ASREngine: string;
-  setASREngine: (ASREngine: string) => void;
+  ASREngine: ASREngineKey;
+  setASREngine: (ASREngine: ASREngineKey) => void;
 
   showPersonaIcons: boolean;
   setShowPersonaIcons: (showPersonaIcons: boolean) => void;
@@ -121,11 +121,11 @@ const useAppChatStore = create<AppChatStore>()(persist(
   micTimeoutMs: 2000,
   setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),
 
-  TTSEngine: TTSEngineList[0],
-  setTTSEngine: (TTSEngine: string) => _set({ TTSEngine }),
+  TTSEngine: TTSEngineList[0].key,
+  setTTSEngine: (TTSEngine: TTSEngineKey) => _set({ TTSEngine }),
 
-  ASREngine: ASREngineList[0],
-  setASREngine: (ASREngine: string) => _set({ ASREngine }),
+  ASREngine: ASREngineList[0].key,
+  setASREngine: (ASREngine: ASREngineKey) => _set({ ASREngine }),
 
   showPersonaIcons: true,
   setShowPersonaIcons: (showPersonaIcons: boolean) => _set({ showPersonaIcons }),
@@ -211,11 +211,11 @@ export const useChatMicTimeoutMsValue = (): number =>
 export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] =>
   useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs]));
 
-export const useTTSEngine = (): [string, (micTimeoutMs: string) => void] =>
+export const useTTSEngine = (): [TTSEngineKey, (TTSEngine: TTSEngineKey) => void] =>
   useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
 export const getTTSEngine = () => useAppChatStore.getState().TTSEngine;
 
-export const useASREngine = (): [string, (micTimeoutMs: string) => void] =>
+export const useASREngine = (): [ASREngineKey, (ASREngine: ASREngineKey) => void] =>
   useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine]));
 
 export const useChatDrawerFilters = () => {

diff --git a/src/apps/settings-modal/SettingsModal.tsx b/src/apps/settings-modal/SettingsModal.tsx
index 2af374cac..feeccb050 100644
--- a/src/apps/settings-modal/SettingsModal.tsx
+++ b/src/apps/settings-modal/SettingsModal.tsx
@@ -198,10 +198,10 @@ export function SettingsModal(props: {
 
-      {TTSEngine === 'Elevenlabs' &&
+      {TTSEngine === 'elevenlabs' &&
       
       }
-      {TTSEngine === 'Web Speech API' &&
+      {TTSEngine === 'webspeech' &&
       
       }
 

diff --git a/src/apps/settings-modal/VoiceSettings.tsx b/src/apps/settings-modal/VoiceSettings.tsx
index 97712ad2c..fcc9725d3 100644
--- a/src/apps/settings-modal/VoiceSettings.tsx
+++ b/src/apps/settings-modal/VoiceSettings.tsx
@@ -4,77 +4,82 @@ import { FormControl } from '@mui/joy';
 
 import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
-
-
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
 import { LanguageSelect } from '~/common/components/LanguageSelect';
 import { useIsMobile } from '~/common/components/useMatchMedia';
-import { hasVoices, ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
+import { hasVoices, ASREngineList, TTSEngineList, TTSEngineKey } from '~/common/components/useVoiceCapabilities';
 
 export function VoiceSettings() {
-
   // external state
   const isMobile = useIsMobile();
   const { autoSpeak, setAutoSpeak } = useChatAutoAI();
-  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
-  const [TTSEngine, setTTSEngine ] = useTTSEngine();
-  const [ASREngine, setASREngine ] = useASREngine();
+  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
+  const [TTSEngine, setTTSEngine] = useTTSEngine();
+  const [ASREngine, setASREngine] = useASREngine();
 
   // this converts from string keys to numbers and vice versa
   const chatTimeoutValue: string = '' + chatTimeoutMs;
   const setChatTimeoutValue = (value: string) => value && setChatTimeoutMs(parseInt(value));
 
-  return <>
-
-    {/* LanguageSelect: moved from the UI settings (where it logically belongs), just to group things better from an UX perspective */}
-    
-    
-    
-
-    
-
-    {!isMobile && 
-      5000 ? 'Best for thinking' : 'Standard'}
-      options={[
-        { value: '600', label: '.6s' },
-        { value: '2000', label: '2s' },
-        { value: '15000', label: '15s' },
-      ]}
-      value={chatTimeoutValue} onChange={setChatTimeoutValue}
-    />}
-
-    
-
-    
-      ({ value: i, label: i }))}
-      value={TTSEngine} onChange={setTTSEngine}
-    />
-
-    
-      ({ value: i, label: i }))}
-      value={ASREngine} onChange={setASREngine}
-    />
-
-  ;
-}
\ No newline at end of file
+  return (
+    <>
+      {/* LanguageSelect: moved from the UI settings (where it logically belongs), just to group things better from an UX perspective */}
+      
+      
+      
+
+      
+
+      {!isMobile && (
+        
+          5000 ? 'Best for thinking' : 'Standard'}
+          options={[
+            { value: '600', label: '.6s' },
+            { value: '2000', label: '2s' },
+            { value: '15000', label: '15s' },
+          ]}
+          value={chatTimeoutValue}
+          onChange={setChatTimeoutValue}
+        />
+      )}
+
+      
+
+      
+        ({ value: i.key, label: i.label }))}
+        value={TTSEngine}
+        onChange={setTTSEngine}
+      />
+
+      
+        ({ value: i.key, label: i.label }))}
+        value={ASREngine}
+        onChange={setASREngine}
+      />
+    
+  );
+}

diff --git a/src/common/components/useVoiceCapabilities.ts b/src/common/components/useVoiceCapabilities.ts
index ad1cc253a..ecfc37442 100644
--- a/src/common/components/useVoiceCapabilities.ts
+++ b/src/common/components/useVoiceCapabilities.ts
@@ -1,47 +1,60 @@
 import { getTTSEngine } from 'src/apps/chat/store-app-chat';
 import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
 
-import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client'
-import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client'
-import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client'
+import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client';
+import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client';
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client';
 
-import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
-import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
-import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
+import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
 
 import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
 import { useBrowserSpeechVoices } from '~/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown';
 
-export const TTSEngineList: string[] = [
-  'Elevenlabs',
-  'Web Speech API'
-]
+export type TTSEngineKey = 'elevenlabs' | 'webspeech';
+export type ASREngineKey = 'webspeech';
 
-export const ASREngineList: string[] = [
-  'Web Speech API'
-]
+export const TTSEngineList: { key: TTSEngineKey; label: string }[] = [
+  {
+    key: 'elevenlabs',
+    label: 'ElevenLabs',
+  },
+  {
+    key: 'webspeech',
+    label: 'Web Speech API',
+  },
+];
+
+export const ASREngineList: { key: ASREngineKey; label: string }[] = [
+  {
+    key: 'webspeech',
+    label: 'Web Speech API',
+  },
+];
 
-export function getConditionalVoices(){
+export function getConditionalVoices() {
   const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'Elevenlabs') {
-    return useElevenLabsVoices
-  }else if (TTSEngine === 'Web Speech API') {
-    return useBrowserSpeechVoices
+  if (TTSEngine === 'elevenlabs') {
+    return useElevenLabsVoices;
+  }
+  if (TTSEngine === 'webspeech') {
+    return useBrowserSpeechVoices;
   }
-  throw new Error('TTSEngine is not found');
 }
 
 export function hasVoices(): boolean {
-  console.log('getConditionalVoices', getConditionalVoices()().hasVoices)
+  console.log('getConditionalVoices', getConditionalVoices()().hasVoices);
   return getConditionalVoices()().hasVoices;
 }
 
 export function getConditionalCapability(): () => CapabilitySpeechSynthesis {
   const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'Elevenlabs') {
-    return useElevenlabsCapability
-  }else if (TTSEngine === 'Web Speech API') {
-    return useBrowserSpeechSynthesisCapability
+  if (TTSEngine === 'elevenlabs') {
+    return useElevenlabsCapability;
+  }
+  if (TTSEngine === 'webspeech') {
+    return useBrowserSpeechSynthesisCapability;
   }
   throw new Error('TTSEngine is not found');
 }
 
 export function useCapability(): CapabilitySpeechSynthesis {
   return getConditionalCapability()();
 }
 
-
 export async function speakText(text: string, voiceId?: string) {
   const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'Elevenlabs') {
+  if (TTSEngine === 'elevenlabs') {
     return await elevenlabsSpeakText(text, voiceId);
-  }else if (TTSEngine === 'Web Speech API') {
+  }
+  if (TTSEngine === 'webspeech') {
     return await browserSpeechSynthesisSpeakText(text, voiceId);
   }
-  throw new Error('TTSEngine is not found');
 }
 
 // let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
 
 export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
   const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'Elevenlabs') {
+  if (TTSEngine === 'elevenlabs') {
     return await EXPERIMENTAL_elevenlabsSpeakTextStream(text, voiceId);
-  }else if (TTSEngine === 'Web Speech API') {
+  }
+  if (TTSEngine === 'webspeech') {
     return await EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream(text, voiceId);
   }
-  throw new Error('TTSEngine is not found');
-}
\ No newline at end of file
+  throw new Error('TTSEngine is not found');
+}
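With this second patch, each engine list entry carries both a stable key (persisted in the store) and a human-readable label (shown in the settings controls), and the TTSEngineKey/ASREngineKey unions narrow the store fields from plain string to the valid keys. If the list ever grows, the union can also be derived from the list itself so the two cannot drift apart — a sketch of that alternative (not what the patch does):

// Derive the key union from the list, so adding an engine updates both at once.
const TTS_ENGINES = [
  { key: 'elevenlabs', label: 'ElevenLabs' },
  { key: 'webspeech', label: 'Web Speech API' },
] as const;

type TTSEngineKeyDerived = typeof TTS_ENGINES[number]['key']; // 'elevenlabs' | 'webspeech'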
diff --git a/src/modules/browser/speech-synthesis/preSelect/Languages.json b/src/modules/browser/speech-synthesis/preSelect/Languages.json
index a2b9ade45..69c3edec8 100644
--- a/src/modules/browser/speech-synthesis/preSelect/Languages.json
+++ b/src/modules/browser/speech-synthesis/preSelect/Languages.json
@@ -1,75 +1,44 @@
 {
   "ar": "Arabic",
-  "as": "Assamese",
-  "bg": "Bulgarian",
   "bho": "Bhojpuri",
   "bn": "Bangla",
-  "brx": "Bodo",
-  "bs": "Bosnian",
   "ca": "Catalan",
   "cmn": "Chinese",
   "cs": "Czech",
-  "cy": "Welsh",
   "da": "Danish",
   "de": "German",
-  "doi": "Dogri",
   "el": "Greek",
   "en": "English",
   "es": "Spanish",
-  "et": "Estonian",
   "eu": "Basque",
   "fa": "Persian",
   "fi": "Finnish",
-  "fil": "Filipino",
   "fr": "French",
   "gl": "Galician",
-  "gu": "Gujarati",
   "he": "Hebrew",
   "hi": "Hindi",
   "hr": "Croatian",
   "hu": "Hungarian",
   "id": "Indonesian",
-  "is": "Icelandic",
   "it": "Italian",
   "ja": "Japanese",
-  "jv": "Javanese",
-  "km": "khmer",
-  "kn": "Kannada",
-  "kok": "Konkani",
   "ko": "Korean",
-  "lt": "Lithuanian",
-  "lv": "Latvia",
-  "mai": "Maithili",
-  "mal": "Malayalam",
-  "mni": "Manipuri",
   "mr": "Marathi",
   "ms": "Malay",
   "nb": "Norwegian Bokmål",
-  "ne": "Nepali",
   "nl": "Dutch",
-  "od": "Odia",
-  "pa": "Punjabi",
   "pl": "Polish",
   "pt": "Portuguese",
   "ro": "Romanian",
   "ru": "Russian",
-  "sa": "Sanskrit",
-  "sat": "Santali",
-  "sd": "Sindhi",
-  "si": "Sinhala",
   "sk": "Slovak",
   "sl": "Slovenian",
-  "sq": "Albanese",
-  "sr": "Serbian",
-  "su": "Sundanese",
   "sv": "Swedish",
-  "sw": "Swahili",
   "ta": "Tamil",
   "te": "Telugu",
   "th": "Thai",
   "tr": "Turkish",
   "uk": "Ukrainian",
-  "ur": "Urdu",
   "vi": "Vietnamese",
   "wuu": "Shanghainese"
 }
\ No newline at end of file

From a538cc195a19924d48fc953707b1a1d39e93fdb1 Mon Sep 17 00:00:00 2001
From: zoollcar
Date: Fri, 25 Oct 2024 23:22:51 +0800
Subject: [PATCH 3/3] Abstract TTS module

---
 app/api/elevenlabs/speech/route.ts            |   2 +-
 pages/info/debug.tsx                          |   5 +-
 src/apps/call/CallWizard.tsx                  |   6 +-
 src/apps/call/Telephone.tsx                   |  14 +--
 src/apps/chat/AppChat.tsx                     |   2 +-
 src/apps/chat/components/ChatMessageList.tsx  |   5 +-
 .../persona/PersonaChatMessageSpeak.ts        |   2 +-
 src/apps/chat/store-app-chat.ts               |  12 +-
 src/apps/settings-modal/SettingsModal.tsx     |  16 +--
 src/apps/settings-modal/VoiceSettings.tsx     |  74 ++++++++----
 src/common/components/useCapabilities.ts      |  12 --
 src/common/components/useVoiceCapabilities.ts |  87 --------------
 src/modules/asr/asr.client.ts                 |   8 ++
 .../browser.speechSynthesis.client.ts         |  48 --------
 src/modules/elevenlabs/elevenlabs.client.ts   |  98 ----------------
 src/modules/tts/tts.client.hooks.ts           |  11 ++
 src/modules/tts/tts.client.ts                 |  41 +++++++
 src/modules/tts/tts.setting.tsx               |  11 ++
 src/modules/tts/useTTSStore.ts                |  34 ++++++
 src/modules/tts/vendors/ISpeechSynthesis.ts   |  30 +++++
 .../elevenlabs/ElevenlabsSettings.tsx         |   5 +-
 .../vendors}/elevenlabs/elevenlabs.router.ts  |   0
 .../vendors}/elevenlabs/elevenlabs.server.ts  |   0
 .../vendors/elevenlabs/elevenlabs.vendor.ts   | 107 ++++++++++++++++++
 .../elevenlabs/store-module-elevenlabs.ts     |   0
 .../elevenlabs/useElevenLabsVoiceDropdown.tsx |   2 +-
 src/modules/tts/vendors/vendors.registry.ts   |  19 ++++
 .../vendors/webspeech/WebspeechSettings.tsx}  |   6 +-
 .../webspeech}/preSelect/Languages.json       |   0
 .../webspeech/store-module-webspeech.ts}      |   0
 .../webspeech/useWebspeechVoiceDropdown.tsx}  |   4 +-
 .../tts/vendors/webspeech/webspeech.vendor.ts |  65 +++++++++++
 src/server/api/trpc.router-edge.ts            |   2 +-
 33 files changed, 417 insertions(+), 311 deletions(-)
 delete mode 100644 src/common/components/useVoiceCapabilities.ts
 create mode 100644 src/modules/asr/asr.client.ts
 delete mode 100644 src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts
 delete mode 100644 src/modules/elevenlabs/elevenlabs.client.ts
 create mode 100644 src/modules/tts/tts.client.hooks.ts
 create mode 100644 src/modules/tts/tts.client.ts
 create mode 100644 src/modules/tts/tts.setting.tsx
 create mode 100644 src/modules/tts/useTTSStore.ts
 create mode 100644 src/modules/tts/vendors/ISpeechSynthesis.ts
 rename src/modules/{ => tts/vendors}/elevenlabs/ElevenlabsSettings.tsx (87%)
 rename src/modules/{ => tts/vendors}/elevenlabs/elevenlabs.router.ts (100%)
 rename src/modules/{ => tts/vendors}/elevenlabs/elevenlabs.server.ts (100%)
 create mode 100644 src/modules/tts/vendors/elevenlabs/elevenlabs.vendor.ts
 rename src/modules/{ => tts/vendors}/elevenlabs/store-module-elevenlabs.ts (100%)
 rename src/modules/{ => tts/vendors}/elevenlabs/useElevenLabsVoiceDropdown.tsx (98%)
 create mode 100644 src/modules/tts/vendors/vendors.registry.ts
 rename src/modules/{browser/speech-synthesis/BrowserSpeechSettings.tsx => tts/vendors/webspeech/WebspeechSettings.tsx} (95%)
 rename src/modules/{browser/speech-synthesis => tts/vendors/webspeech}/preSelect/Languages.json (100%)
 rename src/modules/{browser/speech-synthesis/store-module-browser.tsx => tts/vendors/webspeech/store-module-webspeech.ts} (100%)
 rename src/modules/{browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx => tts/vendors/webspeech/useWebspeechVoiceDropdown.tsx} (96%)
 create mode 100644 src/modules/tts/vendors/webspeech/webspeech.vendor.ts
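The file list above gives the shape of this third patch: each engine becomes a vendor module under src/modules/tts/vendors/ (elevenlabs.vendor.ts, webspeech.vendor.ts) implementing a common ISpeechSynthesis interface, vendors.registry.ts maps engine keys to vendors, and tts.client.ts dispatches through the registry instead of branching on string comparisons. The vendor files themselves fall below the truncation point of this excerpt, so the shape sketched here is an assumption inferred from the visible call sites (speakText, EXPERIMENTAL_speakTextStream, useTTSCapability, getName), not the patch's actual code:

// Assumed shape of the vendor abstraction; the real ISpeechSynthesis.ts
// is not visible in this excerpt of the patch.
type TTSEngineKey = 'elevenlabs' | 'webspeech';

interface ISpeechSynthesis {
  name: string;
  speakText(text: string, voiceId?: string): Promise<void>;
}

declare const elevenlabsVendor: ISpeechSynthesis; // provided by elevenlabs.vendor.ts
declare const webspeechVendor: ISpeechSynthesis;  // provided by webspeech.vendor.ts
declare function getTTSEngine(): TTSEngineKey;    // persisted selection, from useTTSStore.ts

const vendorRegistry: Record<TTSEngineKey, ISpeechSynthesis> = {
  elevenlabs: elevenlabsVendor,
  webspeech: webspeechVendor,
};

// tts.client.ts can then dispatch without per-engine string branches:
function speakText(text: string, voiceId?: string): Promise<void> {
  return vendorRegistry[getTTSEngine()].speakText(text, voiceId);
}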
diff --git a/app/api/elevenlabs/speech/route.ts b/app/api/elevenlabs/speech/route.ts
index d7a8d6e7b..7a9f21678 100644
--- a/app/api/elevenlabs/speech/route.ts
+++ b/app/api/elevenlabs/speech/route.ts
@@ -1,2 +1,2 @@
 export const runtime = 'edge';
-export { elevenLabsHandler as POST } from '~/modules/elevenlabs/elevenlabs.server';
\ No newline at end of file
+export { elevenLabsHandler as POST } from '~/modules/tts/vendors/elevenlabs/elevenlabs.server';
\ No newline at end of file

diff --git a/pages/info/debug.tsx b/pages/info/debug.tsx
index 5d390020d..8b27709b9 100644
--- a/pages/info/debug.tsx
+++ b/pages/info/debug.tsx
@@ -18,7 +18,8 @@ import { ROUTE_APP_CHAT, ROUTE_INDEX } from '~/common/app.routes';
 import { Release } from '~/common/app.release';
 
 // capabilities access
-import { useCapabilityBrowserSpeechRecognition, useVoiceCapability, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 
 // stores access
 import { getLLMsDebugInfo } from '~/common/stores/llms/store-llms';
@@ -95,7 +96,7 @@ function AppDebug() {
   const cProduct = {
     capabilities: {
       mic: useCapabilityBrowserSpeechRecognition(),
-      elevenLabs: useVoiceCapability(),
+      elevenLabs: useTTSCapability(),
       textToImage: useCapabilityTextToImage(),
     },
     models: getLLMsDebugInfo(),

diff --git a/src/apps/call/CallWizard.tsx b/src/apps/call/CallWizard.tsx
index d7bdb767a..190f76ab4 100644
--- a/src/apps/call/CallWizard.tsx
+++ b/src/apps/call/CallWizard.tsx
@@ -12,11 +12,13 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 import { animationColorRainbow } from '~/common/util/animUtils';
 import { navigateBack } from '~/common/app.routes';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
-import { useCapabilityBrowserSpeechRecognition, useVoiceCapability } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 import { useChatStore } from '~/common/stores/chat/store-chats';
 import { useUICounter } from '~/common/state/store-ui';
 
+
 function StatusCard(props: { icon: React.JSX.Element, hasIssue: boolean, text: string, button?: React.JSX.Element }) {
   return (
@@ -45,7 +47,7 @@ export function CallWizard(props: { strict?: boolean, conversationId: string | n
 
   // external state
   const recognition = useCapabilityBrowserSpeechRecognition();
-  const synthesis = useVoiceCapability();
+  const synthesis = useTTSCapability();
 
   const chatIsEmpty = useChatStore(state => {
     if (!props.conversationId) return false;

diff --git a/src/apps/call/Telephone.tsx b/src/apps/call/Telephone.tsx
index d3ffddd92..72c728515 100644
--- a/src/apps/call/Telephone.tsx
+++ b/src/apps/call/Telephone.tsx
@@ -13,10 +13,10 @@ import { ScrollToBottom } from '~/common/scroll-to-bottom/ScrollToBottom';
 import { ScrollToBottomButton } from '~/common/scroll-to-bottom/ScrollToBottomButton';
 import { useChatLLMDropdown } from '../chat/components/layout-bar/useLLMDropdown';
 
-import { EXPERIMENTAL_speakTextStream } from '~/common/components/useVoiceCapabilities';
+import { EXPERIMENTAL_speakTextStream } from '~/modules/tts/tts.client';
 import { SystemPurposeId, SystemPurposes } from '../../data';
 import { llmStreamingChatGenerate, VChatMessageIn } from '~/modules/llms/llm.client';
-import { useElevenLabsVoiceDropdown } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
+import { TTSSetting } from '~/modules/tts/tts.setting';
 
 import type { OptimaBarControlMethods } from '~/common/layout/optima/bar/OptimaBarDropdown';
 import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
@@ -39,6 +39,7 @@ import { CallStatus } from './components/CallStatus';
 import { useAppCallStore } from './state/store-app-call';
 
 
+
 function CallMenuItems(props: {
   pushToTalk: boolean,
   setPushToTalk: (pushToTalk: boolean) => void,
@@ -48,8 +49,7 @@ function CallMenuItems(props: {
 
   // external state
   const { grayUI, toggleGrayUI } = useAppCallStore();
-  const { voicesDropdown } = useElevenLabsVoiceDropdown(false, !props.override);
-
+
   const handlePushToTalkToggle = () => props.setPushToTalk(!props.pushToTalk);
 
   const handleChangeVoiceToggle = () => props.setOverride(!props.override);
@@ -68,10 +68,10 @@ function CallMenuItems(props: {
 
-      
-      {' '}
-      {voicesDropdown}
+      
+      
+      
 

diff --git a/src/apps/chat/AppChat.tsx b/src/apps/chat/AppChat.tsx
index 98f4837da..a21dd584e 100644
--- a/src/apps/chat/AppChat.tsx
+++ b/src/apps/chat/AppChat.tsx
@@ -10,7 +10,7 @@ import { FlattenerModal } from '~/modules/aifn/flatten/FlattenerModal';
 import { TradeConfig, TradeModal } from '~/modules/trade/TradeModal';
 import { downloadSingleChat, importConversationsFromFilesAtRest, openConversationsAtRestPicker } from '~/modules/trade/trade.client';
 import { imaginePromptFromTextOrThrow } from '~/modules/aifn/imagine/imaginePromptFromText';
-import { speakText } from '~/common/components/useVoiceCapabilities';
+import { speakText } from '~/modules/tts/tts.client';
 import { useAreBeamsOpen } from '~/modules/beam/store-beam.hooks';
 import { useCapabilityTextToImage } from '~/modules/t2i/t2i.client';

diff --git a/src/apps/chat/components/ChatMessageList.tsx b/src/apps/chat/components/ChatMessageList.tsx
index 22f62f9c9..2b0df25fe 100644
--- a/src/apps/chat/components/ChatMessageList.tsx
+++ b/src/apps/chat/components/ChatMessageList.tsx
@@ -19,7 +19,7 @@ import { getConversation, useChatStore } from '~/common/stores/chat/store-chats'
 import { openFileForAttaching } from '~/common/components/ButtonAttachFiles';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
 import { useBrowserTranslationWarning } from '~/common/components/useIsBrowserTranslating';
-import { useVoiceCapability } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 import { useChatOverlayStore } from '~/common/chat-overlay/store-perchat_vanilla';
 import { useScrollToBottom } from '~/common/scroll-to-bottom/useScrollToBottom';
@@ -30,6 +30,7 @@
 
 import { PersonaSelector } from './persona-selector/PersonaSelector';
 
 import { useChatAutoSuggestHTMLUI, useChatShowSystemMessages } from '../store-app-chat';
 
+
 const stableNoMessages: DMessage[] = [];
 
 /**
@@ -75,7 +76,7 @@ export function ChatMessageList(props: {
     _composerInReferenceToCount: state.inReferenceTo?.length ?? 0,
     ephemerals: state.ephemerals?.length ? state.ephemerals : null,
   })));
-  const { mayWork: isSpeakable } = useVoiceCapability();
+  const { mayWork: isSpeakable } = useTTSCapability();
 
   // derived state
   const { conversationHandler, conversationId, capabilityHasT2I, onConversationBranch, onConversationExecuteHistory, onTextDiagram, onTextImagine, onTextSpeak } = props;

diff --git a/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts b/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts
index a016af1da..6c753f771 100644
--- a/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts
+++ b/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts
@@ -1,4 +1,4 @@
-import { speakText } from '~/modules/elevenlabs/elevenlabs.client';
+import { speakText } from '~/modules/tts/tts.client';
 
 import { isTextContentFragment } from '~/common/stores/chat/chat.fragments';
 

diff --git a/src/apps/chat/store-app-chat.ts b/src/apps/chat/store-app-chat.ts
index 2a91157ab..8a723af0c 100644
--- a/src/apps/chat/store-app-chat.ts
+++ b/src/apps/chat/store-app-chat.ts
@@ -1,9 +1,9 @@
 import { create } from 'zustand';
 import { persist } from 'zustand/middleware';
 import { useShallow } from 'zustand/react/shallow';
-import { ASREngineKey, ASREngineList, TTSEngineKey, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 import type { DLLMId } from '~/common/stores/llms/llms.types';
+import { ASREngineKey, ASREngineList } from '~/modules/asr/asr.client';
 
 export type ChatAutoSpeakType = 'off' | 'firstLine' | 'all';
 
@@ -52,11 +52,8 @@ interface AppChatStore {
   micTimeoutMs: number;
   setMicTimeoutMs: (micTimeoutMs: number) => void;
 
-  TTSEngine: TTSEngineKey;
-  setTTSEngine: (TTSEngine: TTSEngineKey) => void;
-
   ASREngine: ASREngineKey;
   setASREngine: (ASREngine: ASREngineKey) => void;
 
   showPersonaIcons: boolean;
   setShowPersonaIcons: (showPersonaIcons: boolean) => void;
@@ -121,11 +118,8 @@
   micTimeoutMs: 2000,
   setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),
 
-  TTSEngine: TTSEngineList[0].key,
-  setTTSEngine: (TTSEngine: TTSEngineKey) => _set({ TTSEngine }),
-
   ASREngine: ASREngineList[0].key,
   setASREngine: (ASREngine: ASREngineKey) => _set({ ASREngine }),
 
   showPersonaIcons: true,
   setShowPersonaIcons: (showPersonaIcons: boolean) => _set({ showPersonaIcons }),
@@ -211,10 +205,6 @@ export const useChatMicTimeoutMsValue = (): number =>
 export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] =>
   useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs]));
 
-export const useTTSEngine = (): [TTSEngineKey, (TTSEngine: TTSEngineKey) => void] =>
-  useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
-export const getTTSEngine = () => useAppChatStore.getState().TTSEngine;
-
 export const useASREngine = (): [ASREngineKey, (ASREngine: ASREngineKey) => void] =>
   useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine]));
 
 export const useChatDrawerFilters = () => {

diff --git a/src/apps/settings-modal/SettingsModal.tsx b/src/apps/settings-modal/SettingsModal.tsx
index 3dfe68256..fbecf1822 100644
--- a/src/apps/settings-modal/SettingsModal.tsx
+++ b/src/apps/settings-modal/SettingsModal.tsx
@@ -9,7 +9,6 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 
 import { BrowseSettings } from '~/modules/browse/BrowseSettings';
 import { DallESettings } from '~/modules/t2i/dalle/DallESettings';
-import { ElevenlabsSettings } from '~/modules/elevenlabs/ElevenlabsSettings';
 import { GoogleSearchSettings } from '~/modules/google/GoogleSearchSettings';
 import { ProdiaSettings } from '~/modules/t2i/prodia/ProdiaSettings';
 import { T2ISettings } from '~/modules/t2i/T2ISettings';
@@ -22,9 +21,9 @@ import { AppChatSettingsAI } from './AppChatSettingsAI';
 import { AppChatSettingsUI } from './settings-ui/AppChatSettingsUI';
 import { UxLabsSettings } from './UxLabsSettings';
 import { VoiceSettings } from './VoiceSettings';
-import { BrowserSpeechSettings } from '~/modules/browser/speech-synthesis/BrowserSpeechSettings';
-
-import { useTTSEngine } from 'src/apps/chat/store-app-chat';
+import { useTTSEngine } from '~/modules/tts/useTTSStore';
+import { TTSSetting } from '~/modules/tts/tts.setting';
+import { getName as getTTSEngineName } from '~/modules/tts/tts.client';
 
 // styled into a Topics component
@@ -198,12 +197,9 @@
 
-      {TTSEngine === 'elevenlabs' &&
-      
-      }
-      {TTSEngine === 'webspeech' &&
-      
-      }
+      
+      
+      
 

diff --git a/src/apps/settings-modal/VoiceSettings.tsx b/src/apps/settings-modal/VoiceSettings.tsx
index fcc9725d3..f33658a57 100644
--- a/src/apps/settings-modal/VoiceSettings.tsx
+++ b/src/apps/settings-modal/VoiceSettings.tsx
@@ -1,14 +1,17 @@
 import * as React from 'react';
 
-import { FormControl } from '@mui/joy';
+import { FormControl, Option, Select } from '@mui/joy';
+import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
 
-import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
+import { useASREngine, useChatAutoAI, useChatMicTimeoutMs } from '../chat/store-app-chat';
 
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
 import { LanguageSelect } from '~/common/components/LanguageSelect';
 import { useIsMobile } from '~/common/components/useMatchMedia';
-import { hasVoices, ASREngineList, TTSEngineList, TTSEngineKey } from '~/common/components/useVoiceCapabilities';
+import { ASREngineKey, ASREngineList } from '~/modules/asr/asr.client';
+import { TTSEngineKey, TTSEngineList, useTTSEngine } from '~/modules/tts/useTTSStore';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 
 export function VoiceSettings() {
   // external state
   const isMobile = useIsMobile();
@@ -23,6 +26,18 @@ export function VoiceSettings() {
   const chatTimeoutValue: string = '' + chatTimeoutMs;
   const setChatTimeoutValue = (value: string) => value && setChatTimeoutMs(parseInt(value));
 
+  const { mayWork: hasVoices } = useTTSCapability();
+
+  const handleTTSChanged = (_event: any, newValue: TTSEngineKey | null) => {
+    if (!newValue) return;
+    setTTSEngine(newValue);
+  };
+
+  const handleASRChanged = (_event: any, newValue: ASREngineKey | null) => {
+    if (!newValue) return;
+    setASREngine(newValue);
+  };
+
   return (
     <>
       {/* LanguageSelect: moved from the UI settings (where it logically belongs), just to group things better from an UX perspective */}
@@ -63,23 +78,44 @@ export function VoiceSettings() {
         onChange={setAutoSpeak}
       />
 
-      
-        ({ value: i.key, label: i.label }))}
-        value={TTSEngine}
-        onChange={setTTSEngine}
-      />
+      
+        
+
+        
+      
 
-      
-        ({ value: i.key, label: i.label }))}
-        value={ASREngine}
-        onChange={setASREngine}
-      />
+      
+        
+
+        
+      
     
   );
 }

diff --git a/src/common/components/useCapabilities.ts b/src/common/components/useCapabilities.ts
index 2d2effea2..59b0c51b0 100644
--- a/src/common/components/useCapabilities.ts
+++ b/src/common/components/useCapabilities.ts
@@ -21,18 +21,6 @@ export interface CapabilityBrowserSpeechRecognition {
 
 export { browserSpeechRecognitionCapability as useCapabilityBrowserSpeechRecognition } from './speechrecognition/useSpeechRecognition';
 
-
-/// Speech Synthesis
-
-export interface CapabilitySpeechSynthesis {
-  mayWork: boolean;
-  isConfiguredServerSide: boolean;
-  isConfiguredClientSide: boolean;
-}
-
-export { useCapability as useVoiceCapability } from '~/common/components/useVoiceCapabilities';
-
-
 /// Image Generation
 
 export interface TextToImageProvider {

diff --git a/src/common/components/useVoiceCapabilities.ts b/src/common/components/useVoiceCapabilities.ts
deleted file mode 100644
index ecfc37442..000000000
--- a/src/common/components/useVoiceCapabilities.ts
+++ /dev/null
@@ -1,87 +0,0 @@
-import { getTTSEngine } from 'src/apps/chat/store-app-chat';
-import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
-
-import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client';
-import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client';
-import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client';
-
-import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
-import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
-import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
-
-import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
-import { useBrowserSpeechVoices } from '~/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown';
-
-export type TTSEngineKey = 'elevenlabs' | 'webspeech';
-export type ASREngineKey = 'webspeech';
-
-export const TTSEngineList: { key: TTSEngineKey; label: string }[] = [
-  {
-    key: 'elevenlabs',
-    label: 'ElevenLabs',
-  },
-  {
-    key: 'webspeech',
-    label: 'Web Speech API',
-  },
-];
-
-export const ASREngineList: { key: ASREngineKey; label: string }[] = [
-  {
-    key: 'webspeech',
-    label: 'Web Speech API',
-  },
-];
-
-export function getConditionalVoices() {
-  const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'elevenlabs') {
-    return useElevenLabsVoices;
-  }
-  if (TTSEngine === 'webspeech') {
-    return useBrowserSpeechVoices;
-  }
-}
-
-export function hasVoices(): boolean {
-  console.log('getConditionalVoices', getConditionalVoices()().hasVoices);
-  return getConditionalVoices()().hasVoices;
-}
-
-export function getConditionalCapability(): () => CapabilitySpeechSynthesis {
-  const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'elevenlabs') {
-    return useElevenlabsCapability;
-  }
-  if (TTSEngine === 'webspeech') {
-    return useBrowserSpeechSynthesisCapability;
-  }
-  throw new Error('TTSEngine is not found');
-}
-
-export function useCapability(): CapabilitySpeechSynthesis {
-  return getConditionalCapability()();
-}
-
-export async function speakText(text: string, voiceId?: string) {
-  const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'elevenlabs') {
-    return await elevenlabsSpeakText(text, voiceId);
-  }
-  if (TTSEngine === 'webspeech') {
-    return await browserSpeechSynthesisSpeakText(text, voiceId);
-  }
-}
-
-// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
-
-export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
-  const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'elevenlabs') {
-    return await EXPERIMENTAL_elevenlabsSpeakTextStream(text, voiceId);
-  }
-  if (TTSEngine === 'webspeech') {
-    return await EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream(text, voiceId);
-  }
-  throw new Error('TTSEngine is not 
found'); -} diff --git a/src/modules/asr/asr.client.ts b/src/modules/asr/asr.client.ts new file mode 100644 index 000000000..30db9cf25 --- /dev/null +++ b/src/modules/asr/asr.client.ts @@ -0,0 +1,8 @@ +export type ASREngineKey = 'webspeech'; + +export const ASREngineList: { key: ASREngineKey; label: string }[] = [ + { + key: 'webspeech', + label: 'Web Speech API', + }, +]; diff --git a/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts b/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts deleted file mode 100644 index 2814a760a..000000000 --- a/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts +++ /dev/null @@ -1,48 +0,0 @@ -import { CapabilitySpeechSynthesis } from "~/common/components/useCapabilities"; -import { getBrowseVoiceId } from "./store-module-browser"; - -export function useCapability(): CapabilitySpeechSynthesis { - const synth = window.speechSynthesis; - const voices = synth.getVoices(); - const isConfiguredServerSide = false; - const isConfiguredClientSide = true; - const mayWork = voices.length > 0; - return { mayWork, isConfiguredServerSide, isConfiguredClientSide }; -} - - -export async function speakText(text: string, voiceId?: string) { - if (!(text?.trim())) return; - - try { - const synth = window.speechSynthesis; - const utterThis = new SpeechSynthesisUtterance(text); - const voices = synth.getVoices(); - voiceId = voiceId || getBrowseVoiceId(); - utterThis.voice = voices.find((voice) => voiceId === voice.name) || null; - synth.speak(utterThis); - } catch (error) { - console.error('Error playing first text:', error); - } -} - -export async function cancel() { - const synth = window.speechSynthesis; - synth.cancel(); -} - -export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) { - if (!(text?.trim())) return; - - try { - const synth = window.speechSynthesis; - const utterThis = new SpeechSynthesisUtterance(text); - const voices = synth.getVoices(); - voiceId = voiceId || getBrowseVoiceId(); - utterThis.voice = voices.find((voice) => voiceId === voice.name) || null; - synth.speak(utterThis); - } catch (error) { - // has happened once in months of testing, not sure what was the cause - console.error('EXPERIMENTAL_speakTextStream:', error); - } -} \ No newline at end of file diff --git a/src/modules/elevenlabs/elevenlabs.client.ts b/src/modules/elevenlabs/elevenlabs.client.ts deleted file mode 100644 index 9e7e5ed09..000000000 --- a/src/modules/elevenlabs/elevenlabs.client.ts +++ /dev/null @@ -1,98 +0,0 @@ -import { getBackendCapabilities } from '~/modules/backend/store-backend-capabilities'; - -import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer'; -import { AudioPlayer } from '~/common/util/audio/AudioPlayer'; -import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities'; -import { frontendSideFetch } from '~/common/util/clientFetchers'; -import { useUIPreferencesStore } from '~/common/state/store-ui'; - -import type { SpeechInputSchema } from './elevenlabs.router'; -import { getElevenLabsData, useElevenLabsData } from './store-module-elevenlabs'; - - -export const isValidElevenLabsApiKey = (apiKey?: string) => !!apiKey && apiKey.trim()?.length >= 32; - -export const isElevenLabsEnabled = (apiKey?: string) => apiKey - ? 
isValidElevenLabsApiKey(apiKey) - : getBackendCapabilities().hasVoiceElevenLabs; - - -export function useCapability(): CapabilitySpeechSynthesis { - const [clientApiKey, voiceId] = useElevenLabsData(); - const isConfiguredServerSide = getBackendCapabilities().hasVoiceElevenLabs; - const isConfiguredClientSide = clientApiKey ? isValidElevenLabsApiKey(clientApiKey) : false; - const mayWork = isConfiguredServerSide || isConfiguredClientSide || !!voiceId; - return { mayWork, isConfiguredServerSide, isConfiguredClientSide }; -} - - -export async function speakText(text: string, voiceId?: string) { - if (!(text?.trim())) return; - - const { elevenLabsApiKey, elevenLabsVoiceId } = getElevenLabsData(); - if (!isElevenLabsEnabled(elevenLabsApiKey)) return; - - const { preferredLanguage } = useUIPreferencesStore.getState(); - const nonEnglish = !(preferredLanguage?.toLowerCase()?.startsWith('en')); - - try { - const edgeResponse = await frontendFetchAPIElevenLabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, false); - const audioBuffer = await edgeResponse.arrayBuffer(); - await AudioPlayer.playBuffer(audioBuffer); - } catch (error) { - console.error('Error playing first text:', error); - } -} - -// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined; - -export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) { - if (!(text?.trim())) return; - - const { elevenLabsApiKey, elevenLabsVoiceId } = getElevenLabsData(); - if (!isElevenLabsEnabled(elevenLabsApiKey)) return; - - const { preferredLanguage } = useUIPreferencesStore.getState(); - const nonEnglish = !(preferredLanguage?.toLowerCase()?.startsWith('en')); - - try { - const edgeResponse = await frontendFetchAPIElevenLabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, true); - - // if (!liveAudioPlayer) - const liveAudioPlayer = new AudioLivePlayer(); - // fire/forget - void liveAudioPlayer.EXPERIMENTAL_playStream(edgeResponse); - - } catch (error) { - // has happened once in months of testing, not sure what was the cause - console.error('EXPERIMENTAL_speakTextStream:', error); - } -} - - -/** - * Note: we have to use this client-side API instead of TRPC because of ArrayBuffers.. 
- */
-async function frontendFetchAPIElevenLabsSpeech(text: string, elevenLabsApiKey: string, elevenLabsVoiceId: string, nonEnglish: boolean, streaming: boolean): Promise<Response> {
-  // NOTE: hardcoded 1000 as a failsafe, since the API will take very long and consume lots of credits for longer texts
-  const speechInput: SpeechInputSchema = {
-    elevenKey: elevenLabsApiKey,
-    text: text.slice(0, 1000),
-    voiceId: elevenLabsVoiceId,
-    nonEnglish,
-    ...(streaming && { streaming: true, streamOptimization: 4 }),
-  };
-
-  const response = await frontendSideFetch('/api/elevenlabs/speech', {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify(speechInput),
-  });
-
-  if (!response.ok) {
-    const errorData = await response.json();
-    throw new Error(errorData.error || errorData.message || 'Unknown error');
-  }
-
-  return response;
-}
\ No newline at end of file
diff --git a/src/modules/tts/tts.client.hooks.ts b/src/modules/tts/tts.client.hooks.ts
new file mode 100644
index 000000000..55c685729
--- /dev/null
+++ b/src/modules/tts/tts.client.hooks.ts
@@ -0,0 +1,11 @@
+import { useTTSEngine } from './useTTSStore';
+import { findTTSVendor } from './vendors/vendors.registry';
+
+export function useTTSCapability() {
+  const [TTSEngine] = useTTSEngine(); // subscribe, so callers re-render when the engine changes
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  return vendor.getCapabilityInfo();
+}
diff --git a/src/modules/tts/tts.client.ts b/src/modules/tts/tts.client.ts
new file mode 100644
index 000000000..c96a09eca
--- /dev/null
+++ b/src/modules/tts/tts.client.ts
@@ -0,0 +1,41 @@
+import { getTTSEngine } from './useTTSStore';
+import { findTTSVendor } from './vendors/vendors.registry';
+
+export async function speakText(text: string, voiceId?: string) {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  return vendor.speakText(text, voiceId);
+}
+
+export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  return vendor.EXPERIMENTAL_speakTextStream(text, voiceId);
+}
+
+export function cancel() {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  if (!vendor.cancel) {
+    return;
+  }
+  return vendor.cancel();
+}
+
+export function getName() {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  return vendor.name;
+}
\ No newline at end of file
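For orientation, call sites now go through this `tts.client.ts` facade instead of importing a vendor module directly; the facade re-resolves the vendor from the persisted engine key on every call. A minimal usage sketch (the two handler functions are hypothetical; `speakText` and `cancel` are the exports above):

```ts
import { cancel, speakText } from '~/modules/tts/tts.client';

// Hypothetical handler: speak a message with the engine currently selected in Settings.
async function onSpeakClicked(messageText: string) {
  try {
    // voiceId is omitted, so the active vendor falls back to its own stored voice
    await speakText(messageText);
  } catch (err) {
    // speakText throws when no vendor matches the persisted engine key
    console.warn('TTS unavailable:', err);
  }
}

// Hypothetical cleanup: stop any in-flight utterance (a no-op for vendors without cancel()).
function onCallEnded() {
  cancel();
}
```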
diff --git a/src/modules/tts/tts.setting.tsx b/src/modules/tts/tts.setting.tsx
new file mode 100644
index 000000000..3bcb26d14
--- /dev/null
+++ b/src/modules/tts/tts.setting.tsx
@@ -0,0 +1,11 @@
+import { getTTSEngine } from './useTTSStore';
+import { findTTSVendor } from './vendors/vendors.registry';
+
+export function TTSSetting() {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor || !vendor.TTSSettingsComponent) {
+    return <></>;
+  }
+  return <vendor.TTSSettingsComponent />;
+}
diff --git a/src/modules/tts/useTTSStore.ts b/src/modules/tts/useTTSStore.ts
new file mode 100644
index 000000000..6100a65c0
--- /dev/null
+++ b/src/modules/tts/useTTSStore.ts
@@ -0,0 +1,34 @@
+import { create } from 'zustand';
+import { persist } from 'zustand/middleware';
+import { useShallow } from 'zustand/react/shallow';
+
+export type TTSEngineKey = 'elevenlabs' | 'webspeech';
+
+export const TTSEngineList: { key: TTSEngineKey; label: string }[] = [
+  {
+    key: 'elevenlabs',
+    label: 'ElevenLabs',
+  },
+  {
+    key: 'webspeech',
+    label: 'Web Speech API',
+  },
+];
+
+interface TTSStore {
+  TTSEngine: TTSEngineKey;
+  setTTSEngine: (TTSEngine: TTSEngineKey) => void;
+}
+
+const useTTSStore = create<TTSStore>()(
+  persist(
+    (_set, _get) => ({
+      TTSEngine: TTSEngineList[0].key,
+      setTTSEngine: (TTSEngine: TTSEngineKey) => _set({ TTSEngine }),
+    }),
+    { name: 'tts' },
+  ),
+);
+
+export const useTTSEngine = (): [TTSEngineKey, (TTSEngine: TTSEngineKey) => void] => useTTSStore(useShallow((state) => [state.TTSEngine, state.setTTSEngine]));
+export const getTTSEngine = () => useTTSStore.getState().TTSEngine;
diff --git a/src/modules/tts/vendors/ISpeechSynthesis.ts b/src/modules/tts/vendors/ISpeechSynthesis.ts
new file mode 100644
index 000000000..8a40fca41
--- /dev/null
+++ b/src/modules/tts/vendors/ISpeechSynthesis.ts
@@ -0,0 +1,30 @@
+import type React from 'react';
+
+import type { SvgIconProps } from '@mui/joy';
+import { TTSEngineKey } from '../useTTSStore';
+
+export interface ISpeechSynthesis {
+  readonly id: TTSEngineKey;
+  readonly name: string;
+  readonly location: 'local' | 'cloud';
+
+  // components
+  // readonly Icon: React.FunctionComponent<SvgIconProps>;
+  readonly TTSSettingsComponent?: React.ComponentType;
+
+  /// abstraction interface ///
+
+  hasVoices?(): boolean;
+  getCapabilityInfo(): CapabilitySpeechSynthesis;
+  speakText(text: string, voiceId?: string): Promise<void>;
+  EXPERIMENTAL_speakTextStream(text: string, voiceId?: string): Promise<void>;
+  cancel?(): Promise<void>;
+  stop?(): Promise<void>;
+  resume?(): Promise<void>;
+}
+
+export interface CapabilitySpeechSynthesis {
+  mayWork: boolean;
+  isConfiguredServerSide: boolean;
+  isConfiguredClientSide: boolean;
+}
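Since `useTTSEngine` and `TTSEngineList` are the store's public surface, a settings control can bind directly to them. A minimal sketch of such a picker, in the spirit of the `VoiceSettings` change above (component structure and labels are assumptions):

```tsx
import * as React from 'react';
import { Option, Select } from '@mui/joy';

import { TTSEngineKey, TTSEngineList, useTTSEngine } from '~/modules/tts/useTTSStore';

// Hypothetical picker: writes straight to the persisted 'tts' store slice.
export function TTSEnginePicker() {
  const [TTSEngine, setTTSEngine] = useTTSEngine();
  return (
    <Select
      value={TTSEngine}
      onChange={(_event, newValue: TTSEngineKey | null) => newValue && setTTSEngine(newValue)}
    >
      {TTSEngineList.map((engine) => (
        <Option key={engine.key} value={engine.key}>{engine.label}</Option>
      ))}
    </Select>
  );
}
```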
diff --git a/src/modules/elevenlabs/ElevenlabsSettings.tsx b/src/modules/tts/vendors/elevenlabs/ElevenlabsSettings.tsx
similarity index 87%
rename from src/modules/elevenlabs/ElevenlabsSettings.tsx
rename to src/modules/tts/vendors/elevenlabs/ElevenlabsSettings.tsx
index 97ebc64b3..5a93fc12f 100644
--- a/src/modules/elevenlabs/ElevenlabsSettings.tsx
+++ b/src/modules/tts/vendors/elevenlabs/ElevenlabsSettings.tsx
@@ -5,9 +5,8 @@ import { FormControl } from '@mui/joy';
 import { AlreadySet } from '~/common/components/AlreadySet';
 import { FormInputKey } from '~/common/components/forms/FormInputKey';
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
-import { useVoiceCapability } from '~/common/components/useCapabilities';
 
-import { isElevenLabsEnabled } from './elevenlabs.client';
+import { elevenlabs, isElevenLabsEnabled } from './elevenlabs.vendor';
 import { useElevenLabsVoiceDropdown } from './useElevenLabsVoiceDropdown';
 import { useElevenLabsApiKey } from './store-module-elevenlabs';
 
@@ -16,7 +15,7 @@ export function ElevenlabsSettings() {
 
   // external state
   const [apiKey, setApiKey] = useElevenLabsApiKey();
-  const { isConfiguredServerSide } = useVoiceCapability();
+  const { isConfiguredServerSide } = elevenlabs.getCapabilityInfo();
 
   const { voicesDropdown } = useElevenLabsVoiceDropdown(true);
 
diff --git a/src/modules/elevenlabs/elevenlabs.router.ts b/src/modules/tts/vendors/elevenlabs/elevenlabs.router.ts
similarity index 100%
rename from src/modules/elevenlabs/elevenlabs.router.ts
rename to src/modules/tts/vendors/elevenlabs/elevenlabs.router.ts
diff --git a/src/modules/elevenlabs/elevenlabs.server.ts b/src/modules/tts/vendors/elevenlabs/elevenlabs.server.ts
similarity index 100%
rename from src/modules/elevenlabs/elevenlabs.server.ts
rename to src/modules/tts/vendors/elevenlabs/elevenlabs.server.ts
diff --git a/src/modules/tts/vendors/elevenlabs/elevenlabs.vendor.ts b/src/modules/tts/vendors/elevenlabs/elevenlabs.vendor.ts
new file mode 100644
index 000000000..46b1958e3
--- /dev/null
+++ b/src/modules/tts/vendors/elevenlabs/elevenlabs.vendor.ts
@@ -0,0 +1,107 @@
+import { getBackendCapabilities } from '~/modules/backend/store-backend-capabilities';
+
+import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
+import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
+import { frontendSideFetch } from '~/common/util/clientFetchers';
+import { useUIPreferencesStore } from '~/common/state/store-ui';
+
+import type { SpeechInputSchema } from './elevenlabs.router';
+import { getElevenLabsData, useElevenLabsData } from './store-module-elevenlabs';
+import { ElevenlabsSettings } from './ElevenlabsSettings';
+import { CapabilitySpeechSynthesis, ISpeechSynthesis } from '../ISpeechSynthesis';
+
+const isValidElevenLabsApiKey = (apiKey?: string) => !!apiKey && apiKey.trim()?.length >= 32;
+
+export const isElevenLabsEnabled = (apiKey?: string) => (apiKey ? isValidElevenLabsApiKey(apiKey) : getBackendCapabilities().hasVoiceElevenLabs);
+
+/**
+ * Note: we have to use this client-side API instead of TRPC because of ArrayBuffers..
+ */
+async function frontendFetchAPIElevenLabsSpeech(
+  text: string,
+  elevenLabsApiKey: string,
+  elevenLabsVoiceId: string,
+  nonEnglish: boolean,
+  streaming: boolean,
+): Promise<Response> {
+  // NOTE: hardcoded 1000 as a failsafe, since the API will take very long and consume lots of credits for longer texts
+  const speechInput: SpeechInputSchema = {
+    elevenKey: elevenLabsApiKey,
+    text: text.slice(0, 1000),
+    voiceId: elevenLabsVoiceId,
+    nonEnglish,
+    ...(streaming && { streaming: true, streamOptimization: 4 }),
+  };
+
+  const response = await frontendSideFetch('/api/elevenlabs/speech', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(speechInput),
+  });
+
+  if (!response.ok) {
+    const errorData = await response.json();
+    throw new Error(errorData.error || errorData.message || 'Unknown error');
+  }
+
+  return response;
+}
+
+export const elevenlabs: ISpeechSynthesis = {
+  id: 'elevenlabs',
+  name: 'ElevenLabs',
+  location: 'cloud',
+
+  // components
+  TTSSettingsComponent: ElevenlabsSettings,
+
+  // functions
+  getCapabilityInfo(): CapabilitySpeechSynthesis {
+    const { elevenLabsApiKey: clientApiKey, elevenLabsVoiceId: voiceId } = getElevenLabsData();
+    const isConfiguredServerSide = getBackendCapabilities().hasVoiceElevenLabs;
+    const isConfiguredClientSide = clientApiKey ? isValidElevenLabsApiKey(clientApiKey) : false;
+    const mayWork = isConfiguredServerSide || isConfiguredClientSide || !!voiceId;
+    return { mayWork, isConfiguredServerSide, isConfiguredClientSide };
+  },
+
+  async speakText(text: string, voiceId?: string) {
+    if (!text?.trim()) return;
+
+    const { elevenLabsApiKey, elevenLabsVoiceId } = getElevenLabsData();
+    if (!isElevenLabsEnabled(elevenLabsApiKey)) return;
+
+    const { preferredLanguage } = useUIPreferencesStore.getState();
+    const nonEnglish = !preferredLanguage?.toLowerCase()?.startsWith('en');
+
+    try {
+      const edgeResponse = await frontendFetchAPIElevenLabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, false);
+      const audioBuffer = await edgeResponse.arrayBuffer();
+      await AudioPlayer.playBuffer(audioBuffer);
+    } catch (error) {
+      console.error('Error playing first text:', error);
+    }
+  },

+  // let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
+  async EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+    if (!text?.trim()) return;
+
+    const { elevenLabsApiKey, elevenLabsVoiceId } = getElevenLabsData();
+    if (!isElevenLabsEnabled(elevenLabsApiKey)) return;
+
+    const { preferredLanguage } = useUIPreferencesStore.getState();
+    const nonEnglish = !preferredLanguage?.toLowerCase()?.startsWith('en');
+
+    try {
+      const edgeResponse = await frontendFetchAPIElevenLabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, true);
+
+      // if (!liveAudioPlayer)
+      const liveAudioPlayer = new AudioLivePlayer();
+      // fire/forget
+      void liveAudioPlayer.EXPERIMENTAL_playStream(edgeResponse);
+    } catch (error) {
+      // has happened once in months of testing, not sure what was the cause
+      console.error('EXPERIMENTAL_speakTextStream:', error);
+    }
+  },
+};
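The vendor's `getCapabilityInfo()` is what `useTTSCapability()` surfaces, so UI code can gate speech features on `mayWork` without knowing which engine is active. A small sketch (the `SpeakButton` component is hypothetical):

```tsx
import * as React from 'react';
import { Button } from '@mui/joy';

import { speakText } from '~/modules/tts/tts.client';
import { useTTSCapability } from '~/modules/tts/tts.client.hooks';

// Hypothetical gate: for ElevenLabs, mayWork means a server- or client-side key
// (or a voice id) is configured; for Web Speech it means the browser lists voices.
export function SpeakButton(props: { text: string }) {
  const { mayWork } = useTTSCapability();
  if (!mayWork) return null;
  return <Button onClick={() => void speakText(props.text)}>Speak</Button>;
}
```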
diff --git a/src/modules/elevenlabs/store-module-elevenlabs.ts b/src/modules/tts/vendors/elevenlabs/store-module-elevenlabs.ts
similarity index 100%
rename from src/modules/elevenlabs/store-module-elevenlabs.ts
rename to src/modules/tts/vendors/elevenlabs/store-module-elevenlabs.ts
diff --git a/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx b/src/modules/tts/vendors/elevenlabs/useElevenLabsVoiceDropdown.tsx
similarity index 98%
rename from src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
rename to src/modules/tts/vendors/elevenlabs/useElevenLabsVoiceDropdown.tsx
index 24de0b003..9b2bb0fa4 100644
--- a/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
+++ b/src/modules/tts/vendors/elevenlabs/useElevenLabsVoiceDropdown.tsx
@@ -8,7 +8,7 @@ import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
 import { apiQuery } from '~/common/util/trpc.client';
 
 import { VoiceSchema } from './elevenlabs.router';
-import { isElevenLabsEnabled } from './elevenlabs.client';
+import { isElevenLabsEnabled } from './elevenlabs.vendor';
 import { useElevenLabsApiKey, useElevenLabsVoiceId } from './store-module-elevenlabs';
 
diff --git a/src/modules/tts/vendors/vendors.registry.ts b/src/modules/tts/vendors/vendors.registry.ts
new file mode 100644
index 000000000..75319650f
--- /dev/null
+++ b/src/modules/tts/vendors/vendors.registry.ts
@@ -0,0 +1,19 @@
+import { TTSEngineKey } from '../useTTSStore';
+import { elevenlabs } from './elevenlabs/elevenlabs.vendor';
+import { ISpeechSynthesis } from './ISpeechSynthesis';
+import { webspeech } from './webspeech/webspeech.vendor';
+
+/** Global: Vendor Instances Registry **/
+const MODEL_VENDOR_REGISTRY: Record<TTSEngineKey, ISpeechSynthesis> = {
+  elevenlabs: elevenlabs,
+  webspeech: webspeech,
+};
+
+export function findAllTTSVendors(): ISpeechSynthesis[] {
+  const modelVendors = Object.values(MODEL_VENDOR_REGISTRY);
+  return modelVendors;
+}
+
+export function findTTSVendor(TTSEngineKey?: TTSEngineKey): ISpeechSynthesis | null {
+  return TTSEngineKey ? (MODEL_VENDOR_REGISTRY[TTSEngineKey] ?? null) : null;
+}
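With the registry keyed by `TTSEngineKey`, wiring up a third engine is a three-file change. A hedged sketch, using an invented `'mytts'` key (none of these names exist in the patch):

```ts
// 1) useTTSStore.ts — widen the union and the user-facing list:
export type TTSEngineKey = 'elevenlabs' | 'webspeech' | 'mytts';
// TTSEngineList gains: { key: 'mytts', label: 'My TTS' }

// 2) vendors/mytts/mytts.vendor.ts — implement ISpeechSynthesis:
import { CapabilitySpeechSynthesis, ISpeechSynthesis } from '../ISpeechSynthesis';

export const mytts: ISpeechSynthesis = {
  id: 'mytts',
  name: 'My TTS',
  location: 'cloud',
  getCapabilityInfo: (): CapabilitySpeechSynthesis =>
    ({ mayWork: true, isConfiguredServerSide: true, isConfiguredClientSide: false }),
  speakText: async (text: string, _voiceId?: string) => {
    // fetch audio from a hypothetical endpoint and play it
  },
  EXPERIMENTAL_speakTextStream: async (text: string, voiceId?: string) => {
    // simplest fallback until true streaming exists: delegate to speakText
    return mytts.speakText(text, voiceId);
  },
};

// 3) vendors.registry.ts — add the instance to the map:
// const MODEL_VENDOR_REGISTRY: Record<TTSEngineKey, ISpeechSynthesis> =
//   { elevenlabs, webspeech, mytts };
```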
diff --git a/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx b/src/modules/tts/vendors/webspeech/WebspeechSettings.tsx
similarity index 95%
rename from src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
rename to src/modules/tts/vendors/webspeech/WebspeechSettings.tsx
index 4a6fc6441..d4c4cf801 100644
--- a/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
+++ b/src/modules/tts/vendors/webspeech/WebspeechSettings.tsx
@@ -6,13 +6,13 @@ import CloseRounded from '@mui/icons-material/CloseRounded';
 import { addSnackbar } from '~/common/components/snackbar/useSnackbarsStore';
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 
-import { useBrowserSpeechVoiceDropdown } from './useBrowserSpeechVoiceDropdown';
-import { useLanguageCodeForFilter } from './store-module-browser';
+import { useBrowserSpeechVoiceDropdown } from './useWebspeechVoiceDropdown';
+import { useLanguageCodeForFilter } from './store-module-webspeech';
 
 // languages are defined as a JSON file
 import languages from './preSelect/Languages.json';
 
-export function BrowserSpeechSettings() {
+export function WebspeechSettings() {
   // state
   const [testUtterance, setTestUtterance] = React.useState(null);
   const [voiceNameFilters, setVoiceNameFilters] = React.useState(null);
diff --git a/src/modules/browser/speech-synthesis/preSelect/Languages.json b/src/modules/tts/vendors/webspeech/preSelect/Languages.json
similarity index 100%
rename from src/modules/browser/speech-synthesis/preSelect/Languages.json
rename to src/modules/tts/vendors/webspeech/preSelect/Languages.json
diff --git a/src/modules/browser/speech-synthesis/store-module-browser.tsx b/src/modules/tts/vendors/webspeech/store-module-webspeech.ts
similarity index 100%
rename from src/modules/browser/speech-synthesis/store-module-browser.tsx
rename to src/modules/tts/vendors/webspeech/store-module-webspeech.ts
diff --git a/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx b/src/modules/tts/vendors/webspeech/useWebspeechVoiceDropdown.tsx
similarity index 96%
rename from src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx
rename to src/modules/tts/vendors/webspeech/useWebspeechVoiceDropdown.tsx
index 9db7d462e..c0ff014ea 100644
--- a/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx
+++ b/src/modules/tts/vendors/webspeech/useWebspeechVoiceDropdown.tsx
@@ -4,8 +4,8 @@ import { CircularProgress, Option, Select } from '@mui/joy';
 import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
 import RecordVoiceOverTwoToneIcon from '@mui/icons-material/RecordVoiceOverTwoTone';
 
-import { useBrowseVoiceId } from './store-module-browser';
-import { speakText, cancel } from './browser.speechSynthesis.client';
+import { useBrowseVoiceId } from './store-module-webspeech';
+import { speakText, cancel } from '../../tts.client';
 
 function VoicesDropdown(props: {
   isValidKey: boolean;
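Note that the voice dropdown now previews voices through the engine-agnostic facade rather than the old `browser.speechSynthesis.client`. A sketch of the preview pattern this enables (the helper and sample sentence are hypothetical):

```ts
import { cancel, speakText } from '~/modules/tts/tts.client';

// Hypothetical preview: cut off any utterance in flight, then speak a short
// sample with the candidate voice (for Web Speech, voiceId is the voice name).
async function previewVoice(voiceId: string) {
  cancel();
  await speakText('Hello! This is how I sound.', voiceId);
}
```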
diff --git a/src/modules/tts/vendors/webspeech/webspeech.vendor.ts b/src/modules/tts/vendors/webspeech/webspeech.vendor.ts
new file mode 100644
index 000000000..9242ab50c
--- /dev/null
+++ b/src/modules/tts/vendors/webspeech/webspeech.vendor.ts
@@ -0,0 +1,65 @@
+import { getBrowseVoiceId } from './store-module-webspeech';
+import { CapabilitySpeechSynthesis, ISpeechSynthesis } from '../ISpeechSynthesis';
+import { WebspeechSettings } from './WebspeechSettings';
+
+export const webspeech: ISpeechSynthesis = {
+  id: 'webspeech',
+  name: 'Web Speech API',
+  location: 'local',
+
+  // components
+  TTSSettingsComponent: WebspeechSettings,
+
+  // functions
+
+  getCapabilityInfo(): CapabilitySpeechSynthesis {
+    const synth = window.speechSynthesis;
+    const voices = synth.getVoices();
+    const isConfiguredServerSide = false;
+    const isConfiguredClientSide = true;
+    const mayWork = voices.length > 0;
+    return { mayWork, isConfiguredServerSide, isConfiguredClientSide };
+  },
+
+  hasVoices() {
+    const synth = window.speechSynthesis;
+    const voices = synth.getVoices();
+    return voices.length > 0;
+  },
+
+  async speakText(text: string, voiceId?: string) {
+    if (!text?.trim()) return;
+
+    try {
+      const synth = window.speechSynthesis;
+      const utterThis = new SpeechSynthesisUtterance(text);
+      const voices = synth.getVoices();
+      voiceId = voiceId || getBrowseVoiceId();
+      utterThis.voice = voices.find((voice) => voiceId === voice.name) || null;
+      synth.speak(utterThis);
+    } catch (error) {
+      console.error('Error playing first text:', error);
+    }
+  },
+
+  async cancel() {
+    const synth = window.speechSynthesis;
+    synth.cancel();
+  },
+
+  async EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+    if (!text?.trim()) return;
+
+    try {
+      const synth = window.speechSynthesis;
+      const utterThis = new SpeechSynthesisUtterance(text);
+      const voices = synth.getVoices();
+      voiceId = voiceId || getBrowseVoiceId();
+      utterThis.voice = voices.find((voice) => voiceId === voice.name) || null;
+      synth.speak(utterThis);
+    } catch (error) {
+      // has happened once in months of testing, not sure what was the cause
+      console.error('EXPERIMENTAL_speakTextStream:', error);
+    }
+  },
+};
diff --git a/src/server/api/trpc.router-edge.ts b/src/server/api/trpc.router-edge.ts
index 54fce437b..e2d3fac67 100644
--- a/src/server/api/trpc.router-edge.ts
+++ b/src/server/api/trpc.router-edge.ts
@@ -2,7 +2,7 @@ import { createTRPCRouter } from './trpc.server';
 
 import { aixRouter } from '~/modules/aix/server/api/aix.router';
 import { backendRouter } from '~/modules/backend/backend.router';
-import { elevenlabsRouter } from '~/modules/elevenlabs/elevenlabs.router';
+import { elevenlabsRouter } from '~/modules/tts/vendors/elevenlabs/elevenlabs.router';
 import { googleSearchRouter } from '~/modules/google/search.router';
 import { llmAnthropicRouter } from '~/modules/llms/server/anthropic/anthropic.router';
 import { llmGeminiRouter } from '~/modules/llms/server/gemini/gemini.router';
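One caveat for the Web Speech vendor above: in Chrome and some other browsers, `speechSynthesis.getVoices()` returns an empty array until the asynchronous `voiceschanged` event fires, so `getCapabilityInfo().mayWork` and `hasVoices()` can read false on first render. A defensive helper along these lines could smooth that over (hypothetical, not part of this patch):

```ts
// Resolve the voice list, waiting once for `voiceschanged` if it is still empty.
function getVoicesAsync(timeoutMs = 1500): Promise<SpeechSynthesisVoice[]> {
  return new Promise((resolve) => {
    const synth = window.speechSynthesis;
    const voices = synth.getVoices();
    if (voices.length) return resolve(voices);
    const done = () => resolve(synth.getVoices());
    synth.addEventListener('voiceschanged', done, { once: true });
    setTimeout(done, timeoutMs); // some engines never fire the event
  });
}
```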