From c78694f1931cbaeeaa7958fbe4af80192022ebce Mon Sep 17 00:00:00 2001
From: zoollcar
Date: Mon, 14 Oct 2024 13:41:28 +0800
Subject: [PATCH 1/3] TTS for version 2

---
 pages/info/debug.tsx                          |   4 +-
 src/apps/call/CallWizard.tsx                  |   4 +-
 src/apps/call/Telephone.tsx                   |  17 ++-
 src/apps/chat/AppChat.tsx                     |   2 +-
 src/apps/chat/components/ChatMessageList.tsx  |   4 +-
 src/apps/chat/store-app-chat.ts               |  20 +++
 src/apps/settings-modal/SettingsModal.tsx     |  12 +-
 src/apps/settings-modal/VoiceSettings.tsx     |  27 +++-
 src/common/components/useCapabilities.ts      |   6 +-
 src/common/components/useVoiceCapabilities.ts |  74 +++++++++++
 .../BrowserSpeechSettings.tsx                 | 111 ++++++++++++++++
 .../browser.speechSynthesis.client.ts         |  48 +++++++
 .../speech-synthesis/preSelect/Languages.json |  75 +++++++++++
 .../speech-synthesis/store-module-browser.tsx |  40 ++++++
 .../useBrowserSpeechVoiceDropdown.tsx         | 124 ++++++++++++++++++
 src/modules/elevenlabs/ElevenlabsSettings.tsx |   4 +-
 src/modules/elevenlabs/elevenlabs.client.ts   |   4 +-
 .../elevenlabs/useElevenLabsVoiceDropdown.tsx |   4 +
 18 files changed, 555 insertions(+), 25 deletions(-)
 create mode 100644 src/common/components/useVoiceCapabilities.ts
 create mode 100644 src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
 create mode 100644 src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts
 create mode 100644 src/modules/browser/speech-synthesis/preSelect/Languages.json
 create mode 100644 src/modules/browser/speech-synthesis/store-module-browser.tsx
 create mode 100644 src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx

diff --git a/pages/info/debug.tsx b/pages/info/debug.tsx
index 066e70b9a..27ffbbe49 100644
--- a/pages/info/debug.tsx
+++ b/pages/info/debug.tsx
@@ -20,7 +20,7 @@ import { ROUTE_APP_CHAT, ROUTE_INDEX } from '~/common/app.routes';
 import { incrementalNewsVersion, useAppNewsStateStore } from '../../src/apps/news/news.version';
 
 // capabilities access
-import { useCapabilityBrowserSpeechRecognition, useCapabilityElevenLabs, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useVoiceCapability, useCapabilityTextToImage } from '~/common/components/useCapabilities';
 
 // stores access
 import { getLLMsDebugInfo } from '~/common/stores/llms/store-llms';
@@ -96,7 +96,7 @@ function AppDebug() {
   const cProduct = {
     capabilities: {
       mic: useCapabilityBrowserSpeechRecognition(),
-      elevenLabs: useCapabilityElevenLabs(),
+      elevenLabs: useVoiceCapability(),
       textToImage: useCapabilityTextToImage(),
     },
     models: getLLMsDebugInfo(),

diff --git a/src/apps/call/CallWizard.tsx b/src/apps/call/CallWizard.tsx
index ab8a7ad6f..d7bdb767a 100644
--- a/src/apps/call/CallWizard.tsx
+++ b/src/apps/call/CallWizard.tsx
@@ -12,7 +12,7 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 import { animationColorRainbow } from '~/common/util/animUtils';
 import { navigateBack } from '~/common/app.routes';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
-import { useCapabilityBrowserSpeechRecognition, useCapabilityElevenLabs } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useVoiceCapability } from '~/common/components/useCapabilities';
 import { useChatStore } from '~/common/stores/chat/store-chats';
 import { useUICounter } from '~/common/state/store-ui';
 
@@ -45,7 +45,7 @@ export function CallWizard(props: { strict?: boolean, conversationId: string | n
 
   // external state
   const recognition = useCapabilityBrowserSpeechRecognition();
-  const synthesis = useCapabilityElevenLabs();
+  const synthesis = useVoiceCapability();
 
   const chatIsEmpty = useChatStore(state => {
     if (!props.conversationId) return false;

diff --git a/src/apps/call/Telephone.tsx b/src/apps/call/Telephone.tsx
index ebbdd18bc..ba0b108b1 100644
--- a/src/apps/call/Telephone.tsx
+++ b/src/apps/call/Telephone.tsx
@@ -13,7 +13,7 @@ import { ScrollToBottom } from '~/common/scroll-to-bottom/ScrollToBottom';
 import { ScrollToBottomButton } from '~/common/scroll-to-bottom/ScrollToBottomButton';
 import { useChatLLMDropdown } from '../chat/components/layout-bar/useLLMDropdown';
 
-import { EXPERIMENTAL_speakTextStream } from '~/modules/elevenlabs/elevenlabs.client';
+import { EXPERIMENTAL_speakTextStream } from '~/common/components/useVoiceCapabilities';
 import { SystemPurposeId, SystemPurposes } from '../../data';
 import { llmStreamingChatGenerate, VChatMessageIn } from '~/modules/llms/llm.client';
 import { useElevenLabsVoiceDropdown } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
@@ -245,13 +245,22 @@ export function Telephone(props: {
     // perform completion
     responseAbortController.current = new AbortController();
     let finalText = '';
+    let currentSentence = '';
     let error: any | null = null;
     setPersonaTextInterim('💭...');
     llmStreamingChatGenerate(chatLLMId, callPrompt, 'call', callMessages[0].id, null, null, responseAbortController.current.signal, ({ textSoFar }) => {
       const text = textSoFar?.trim();
       if (text) {
-        finalText = text;
         setPersonaTextInterim(text);
+
+        // Maintain and say the current sentence
+        if (/[.,!?]$/.test(text)) {
+          currentSentence = text.substring(finalText?.length)
+          finalText = text
+          if (currentSentence?.length >= 1)
+            void EXPERIMENTAL_speakTextStream(currentSentence, personaVoiceId);
+        }
+        currentSentence = text.substring(finalText?.length) // to be added to the final text
       }
     }).catch((err: DOMException) => {
       if (err?.name !== 'AbortError')
@@ -261,8 +270,8 @@
       if (finalText || error)
        setCallMessages(messages => [...messages, createDMessageTextContent('assistant', finalText + (error ? ` (ERROR: ${error.message || error.toString()})` : ''))]); // [state] append assistant:call_response
       // fire/forget
-      if (finalText?.length >= 1)
-        void EXPERIMENTAL_speakTextStream(finalText, personaVoiceId);
+      if (currentSentence?.length >= 1)
+        void EXPERIMENTAL_speakTextStream(currentSentence, personaVoiceId);
     });
 
     return () => {
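The Telephone.tsx hunks above are the heart of this patch: instead of speaking the whole reply once the stream ends, every time the accumulated text ends in punctuation the delta since the last spoken prefix is handed to the TTS engine, and the final fire-and-forget call now speaks only the trailing fragment rather than repeating the entire reply. The chunking rule can be read in isolation; a minimal sketch (the makeSentenceChunker name and the speak callback are illustrative, not part of the patch):

// Speak only the newly-completed chunk of a growing text stream.
// spokenLength tracks how much of the stream has already been spoken.
function makeSentenceChunker(speak: (chunk: string) => void) {
  let spokenLength = 0;
  return {
    // call on every streaming update, with the full text so far
    onText(textSoFar: string) {
      if (/[.,!?]$/.test(textSoFar)) {
        const chunk = textSoFar.substring(spokenLength);
        if (chunk.length >= 1)
          speak(chunk);
        spokenLength = textSoFar.length;
      }
    },
    // call once the stream ends, to flush any unterminated tail
    flush(finalText: string) {
      const tail = finalText.substring(spokenLength);
      if (tail.length >= 1)
        speak(tail);
      spokenLength = finalText.length;
    },
  };
}

Note that the /[.,!?]$/ test also fires on commas, so the spoken chunks are clause-sized rather than strictly sentence-sized, trading a little prosody for lower speech latency.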
diff --git a/src/apps/chat/AppChat.tsx b/src/apps/chat/AppChat.tsx
index 0c3d17b52..a6107ec1a 100644
--- a/src/apps/chat/AppChat.tsx
+++ b/src/apps/chat/AppChat.tsx
@@ -10,7 +10,7 @@ import { FlattenerModal } from '~/modules/aifn/flatten/FlattenerModal';
 import { TradeConfig, TradeModal } from '~/modules/trade/TradeModal';
 import { downloadSingleChat, importConversationsFromFilesAtRest, openConversationsAtRestPicker } from '~/modules/trade/trade.client';
 import { imaginePromptFromTextOrThrow } from '~/modules/aifn/imagine/imaginePromptFromText';
-import { speakText } from '~/modules/elevenlabs/elevenlabs.client';
+import { speakText } from '~/common/components/useVoiceCapabilities';
 import { useAreBeamsOpen } from '~/modules/beam/store-beam.hooks';
 import { useCapabilityTextToImage } from '~/modules/t2i/t2i.client';

diff --git a/src/apps/chat/components/ChatMessageList.tsx b/src/apps/chat/components/ChatMessageList.tsx
index 8c6cec862..0111b564f 100644
--- a/src/apps/chat/components/ChatMessageList.tsx
+++ b/src/apps/chat/components/ChatMessageList.tsx
@@ -19,7 +19,7 @@ import { getConversation, useChatStore } from '~/common/stores/chat/store-chats'
 import { openFileForAttaching } from '~/common/components/ButtonAttachFiles';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
 import { useBrowserTranslationWarning } from '~/common/components/useIsBrowserTranslating';
-import { useCapabilityElevenLabs } from '~/common/components/useCapabilities';
+import { useVoiceCapability } from '~/common/components/useCapabilities';
 import { useChatOverlayStore } from '~/common/chat-overlay/store-chat-overlay';
 import { useScrollToBottom } from '~/common/scroll-to-bottom/useScrollToBottom';
@@ -75,7 +75,7 @@ export function ChatMessageList(props: {
     _composerInReferenceToCount: state.inReferenceTo?.length ?? 0,
     ephemerals: state.ephemerals?.length ? state.ephemerals : null,
   })));
-  const { mayWork: isSpeakable } = useCapabilityElevenLabs();
+  const { mayWork: isSpeakable } = useVoiceCapability();
 
   // derived state
   const { conversationHandler, conversationId, capabilityHasT2I, onConversationBranch, onConversationExecuteHistory, onTextDiagram, onTextImagine, onTextSpeak } = props;

diff --git a/src/apps/chat/store-app-chat.ts b/src/apps/chat/store-app-chat.ts
index f3fcc163e..4760fe161 100644
--- a/src/apps/chat/store-app-chat.ts
+++ b/src/apps/chat/store-app-chat.ts
@@ -1,6 +1,7 @@
 import { create } from 'zustand';
 import { persist } from 'zustand/middleware';
 import { useShallow } from 'zustand/react/shallow';
+import { ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 import type { DLLMId } from '~/common/stores/llms/llms.types';
 
@@ -51,6 +52,12 @@ interface AppChatStore {
   micTimeoutMs: number;
   setMicTimeoutMs: (micTimeoutMs: number) => void;
 
+  TTSEngine: string;
+  setTTSEngine: (TTSEngine: string) => void;
+
+  ASREngine: string;
+  setASREngine: (ASREngine: string) => void;
+
   showPersonaIcons: boolean;
   setShowPersonaIcons: (showPersonaIcons: boolean) => void;
 
@@ -114,6 +121,12 @@ const useAppChatStore = create<AppChatStore>()(persist(
   micTimeoutMs: 2000,
   setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),
 
+  TTSEngine: TTSEngineList[0],
+  setTTSEngine: (TTSEngine: string) => _set({ TTSEngine }),
+
+  ASREngine: ASREngineList[0],
+  setASREngine: (ASREngine: string) => _set({ ASREngine }),
+
   showPersonaIcons: true,
   setShowPersonaIcons: (showPersonaIcons: boolean) => _set({ showPersonaIcons }),
 
@@ -198,6 +211,13 @@ export const useChatMicTimeoutMsValue = (): number =>
 export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] =>
   useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs]));
 
+export const useTTSEngine = (): [string, (micTimeoutMs: string) => void] =>
+  useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
+export const getTTSEngine = () => useAppChatStore.getState().TTSEngine;
+
+export const useASREngine = (): [string, (micTimeoutMs: string) => void] =>
+  useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine]));
+
 export const useChatDrawerFilters = () => {
   const values = useAppChatStore(useShallow(state => ({
     filterHasDocFragments: state.filterHasDocFragments,

diff --git a/src/apps/settings-modal/SettingsModal.tsx b/src/apps/settings-modal/SettingsModal.tsx
index 3a5742e4c..2af374cac 100644
--- a/src/apps/settings-modal/SettingsModal.tsx
+++ b/src/apps/settings-modal/SettingsModal.tsx
@@ -22,6 +22,9 @@ import { AppChatSettingsAI } from './AppChatSettingsAI';
 import { AppChatSettingsUI } from './settings-ui/AppChatSettingsUI';
 import { UxLabsSettings } from './UxLabsSettings';
 import { VoiceSettings } from './VoiceSettings';
+import { BrowserSpeechSettings } from '~/modules/browser/speech-synthesis/BrowserSpeechSettings';
+
+import { useTTSEngine } from 'src/apps/chat/store-app-chat';
 
 // styled into a Topics component
@@ -122,6 +125,8 @@ export function SettingsModal(props: {
   // external state
   const isMobile = useIsMobile();
 
+  const [TTSEngine] = useTTSEngine()
+
   // handlers
   const { setTab } = props;
@@ -193,9 +198,12 @@
 
-      
+      {TTSEngine === 'Elevenlabs' &&
-      
+      }
+      {TTSEngine === 'Web Speech API' &&
+      
+      }
 

diff --git a/src/apps/settings-modal/VoiceSettings.tsx b/src/apps/settings-modal/VoiceSettings.tsx
index 404f15c59..97712ad2c 100644
--- a/src/apps/settings-modal/VoiceSettings.tsx
+++ b/src/apps/settings-modal/VoiceSettings.tsx
@@ -2,24 +2,25 @@ import * as React from 'react';
 
 import { FormControl } from '@mui/joy';
 
-import { useChatAutoAI, useChatMicTimeoutMs } from '../chat/store-app-chat';
+import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
+
 
-import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
 import { LanguageSelect } from '~/common/components/LanguageSelect';
 import { useIsMobile } from '~/common/components/useMatchMedia';
-
+import { hasVoices, ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 export function VoiceSettings() {
 
   // external state
   const isMobile = useIsMobile();
   const { autoSpeak, setAutoSpeak } = useChatAutoAI();
-  const { hasVoices } = useElevenLabsVoices();
-  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
+  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
+  const [TTSEngine, setTTSEngine ] = useTTSEngine();
+  const [ASREngine, setASREngine ] = useASREngine();
 
   // this converts from string keys to numbers and vice versa
   const chatTimeoutValue: string = '' + chatTimeoutMs;
@@ -59,5 +60,21 @@
       value={autoSpeak} onChange={setAutoSpeak}
     />
+
+    
+      ({ value: i, label: i }))}
+      value={TTSEngine} onChange={setTTSEngine}
+    />
+
+    
+      ({ value: i, label: i }))}
+      value={ASREngine} onChange={setASREngine}
+    />
+
   ;
 }
\ No newline at end of file

diff --git a/src/common/components/useCapabilities.ts b/src/common/components/useCapabilities.ts
index 33a1be905..52b9facef 100644
--- a/src/common/components/useCapabilities.ts
+++ b/src/common/components/useCapabilities.ts
@@ -22,15 +22,15 @@ export interface CapabilityBrowserSpeechRecognition {
 
 export { browserSpeechRecognitionCapability as useCapabilityBrowserSpeechRecognition } from './useSpeechRecognition';
 
-/// Speech Synthesis: ElevenLabs
+/// Speech Synthesis
 
-export interface CapabilityElevenLabsSpeechSynthesis {
+export interface CapabilitySpeechSynthesis {
   mayWork: boolean;
   isConfiguredServerSide: boolean;
   isConfiguredClientSide: boolean;
 }
 
-export { useCapability as useCapabilityElevenLabs } from '~/modules/elevenlabs/elevenlabs.client';
+export { useCapability as useVoiceCapability } from '~/common/components/useVoiceCapabilities';
 
 /// Image Generation

diff --git a/src/common/components/useVoiceCapabilities.ts b/src/common/components/useVoiceCapabilities.ts
new file mode 100644
index 000000000..ad1cc253a
--- /dev/null
+++ b/src/common/components/useVoiceCapabilities.ts
@@ -0,0 +1,74 @@
+import { getTTSEngine } from 'src/apps/chat/store-app-chat';
+import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
+
+import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client'
+import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client'
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client'
+
+import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+
+import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
+import { useBrowserSpeechVoices } from '~/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown';
+
+export const TTSEngineList: string[] = [
+  'Elevenlabs',
+  'Web Speech API'
+]
+
+export const ASREngineList: string[] = [
+  'Web Speech API'
+]
+
+export function getConditionalVoices(){
+  const TTSEngine = getTTSEngine();
+  if (TTSEngine === 'Elevenlabs') {
+    return useElevenLabsVoices
+  }else if (TTSEngine === 'Web Speech API') {
+    return useBrowserSpeechVoices
+  }
+  throw new Error('TTSEngine is not found');
+}
+
+export function hasVoices(): boolean {
+  console.log('getConditionalVoices', getConditionalVoices()().hasVoices)
+  return getConditionalVoices()().hasVoices;
+}
+
+export function getConditionalCapability(): () => CapabilitySpeechSynthesis {
+  const TTSEngine = getTTSEngine();
+  if (TTSEngine === 'Elevenlabs') {
+    return useElevenlabsCapability
+  }else if (TTSEngine === 'Web Speech API') {
+    return useBrowserSpeechSynthesisCapability
+  }
+  throw new Error('TTSEngine is not found');
+}
+
+export function useCapability(): CapabilitySpeechSynthesis {
+  return getConditionalCapability()();
+}
+
+
+export async function speakText(text: string, voiceId?: string) {
+  const TTSEngine = getTTSEngine();
+  if (TTSEngine === 'Elevenlabs') {
+    return await elevenlabsSpeakText(text, voiceId);
+  }else if (TTSEngine === 'Web Speech API') {
+    return await browserSpeechSynthesisSpeakText(text, voiceId);
+  }
+  throw new Error('TTSEngine is not found');
+}
+
+// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
+
+export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+  const TTSEngine = getTTSEngine();
+  if (TTSEngine === 'Elevenlabs') {
+    return await EXPERIMENTAL_elevenlabsSpeakTextStream(text, voiceId);
+  }else if (TTSEngine === 'Web Speech API') {
+    return await EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream(text, voiceId);
+  }
+  throw new Error('TTSEngine is not found');
+}
\ No newline at end of file
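One caveat with the dispatcher file above: useCapability() resolves to either useElevenlabsCapability or useBrowserSpeechSynthesisCapability depending on the persisted engine, so switching engines at runtime changes which React hook executes between renders — something the Rules of Hooks forbid (hasVoices() has the same shape, plus a leftover console.log). A safer arrangement, sketched below under the same two-engine assumption and not part of the patch, is to call both vendor hooks unconditionally and select the result afterwards:

// Hypothetical reshaping of useCapability(): both hooks always run,
// so the hook order stays stable no matter which engine is selected.
function useCapabilityStable(): CapabilitySpeechSynthesis {
  const elevenlabs = useElevenlabsCapability();
  const webspeech = useBrowserSpeechSynthesisCapability();
  return getTTSEngine() === 'Elevenlabs' ? elevenlabs : webspeech;
}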
diff --git a/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx b/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
new file mode 100644
index 000000000..4a6fc6441
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
@@ -0,0 +1,111 @@
+import * as React from 'react';
+
+import { Option, FormControl, Select, Switch, Typography, Box, IconButton } from '@mui/joy';
+import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
+import CloseRounded from '@mui/icons-material/CloseRounded';
+import { addSnackbar } from '~/common/components/snackbar/useSnackbarsStore';
+
+import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
+import { useBrowserSpeechVoiceDropdown } from './useBrowserSpeechVoiceDropdown';
+import { useLanguageCodeForFilter } from './store-module-browser';
+
+// languages are defined as a JSON file
+import languages from './preSelect/Languages.json';
+
+export function BrowserSpeechSettings() {
+
+  // state
+  const [testUtterance, setTestUtterance] = React.useState<string | null>(null);
+  const [voiceNameFilters, setVoiceNameFilters] = React.useState<string[] | null>(null);
+
+  // external state
+  const [languageCode, setLanguageCode] = useLanguageCodeForFilter();
+
+  React.useEffect(() => {
+    if (languageCode) {
+      const fetchFunction = async () => {
+        let res = await fetch(`https://raw.githubusercontent.com/HadrienGardeur/web-speech-recommended-voices/refs/heads/main/json/${languageCode}.json`);
+        let data = await res.json();
+        let voices = data.voices;
+        voices = voices.filter((voice: any) => {
+          return voice.quality.includes('high') || voice.quality.includes('veryHigh');
+        });
+        let voiceNameFilters = voices.map((voice: any) => voice.name);
+        setTestUtterance(data.testUtterance);
+        setVoiceNameFilters(voiceNameFilters);
+      };
+      fetchFunction().catch((err) => {
+        console.log('Error getting voice list: ', err);
+        addSnackbar({ key: 'browser-speech-synthesis', message: 'Error getting voice list', type: 'issue' });
+        setTestUtterance(null);
+        setVoiceNameFilters(null);
+        setLanguageCode('');
+      });
+    } else {
+      setTestUtterance(null);
+      setVoiceNameFilters(null);
+    }
+  }, [languageCode, setLanguageCode]);
+
+  const { voicesDropdown } = useBrowserSpeechVoiceDropdown(true, { voiceNameFilters, testUtterance });
+
+  const languageOptions = React.useMemo(() => {
+    return Object.entries(languages)
+      .sort((a, b) => {
+        return a[1].localeCompare(b[1]);
+      })
+      .map(([languageCode, languageName]) => (
+        
+      ));
+  }, []);
+
+  function handleLanguageChanged(_event: any, newValue: string | null) {
+    setLanguageCode(newValue || '');
+  }
+
+  return (
+    <>
+      
+      
+      
+      
+      
+      {voicesDropdown}
+      
+    
+  );
+}

diff --git a/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts b/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts
new file mode 100644
index 000000000..2814a760a
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts
@@ -0,0 +1,48 @@
+import { CapabilitySpeechSynthesis } from "~/common/components/useCapabilities";
+import { getBrowseVoiceId } from "./store-module-browser";
+
+export function useCapability(): CapabilitySpeechSynthesis {
+  const synth = window.speechSynthesis;
+  const voices = synth.getVoices();
+  const isConfiguredServerSide = false;
+  const isConfiguredClientSide = true;
+  const mayWork = voices.length > 0;
+  return { mayWork, isConfiguredServerSide, isConfiguredClientSide };
+}
+
+
+export async function speakText(text: string, voiceId?: string) {
+  if (!(text?.trim())) return;
+
+  try {
+    const synth = window.speechSynthesis;
+    const utterThis = new SpeechSynthesisUtterance(text);
+    const voices = synth.getVoices();
+    voiceId = voiceId || getBrowseVoiceId();
+    utterThis.voice = voices.find((voice) => voiceId === voice.name) || null;
+    synth.speak(utterThis);
+  } catch (error) {
+    console.error('Error playing first text:', error);
+  }
+}
+
+export async function cancel() {
+  const synth = window.speechSynthesis;
+  synth.cancel();
+}
+
+export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+  if (!(text?.trim())) return;
+
+  try {
+    const synth = window.speechSynthesis;
+    const utterThis = new SpeechSynthesisUtterance(text);
+    const voices = synth.getVoices();
+    voiceId = voiceId || getBrowseVoiceId();
+    utterThis.voice = voices.find((voice) => voiceId === voice.name) || null;
+    synth.speak(utterThis);
+  } catch (error) {
+    // has happened once in months of testing, not sure what was the cause
+    console.error('EXPERIMENTAL_speakTextStream:', error);
+  }
+}
\ No newline at end of file
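Two environment caveats are worth flagging for browser.speechSynthesis.client.ts above: useCapability() touches window.speechSynthesis during render, which throws when Next.js renders the component on the server, and getVoices() returns an empty array until the asynchronous voiceschanged event has fired at least once in some browsers (Chrome in particular). A guarded variant might look like this (a sketch only, not what the patch ships):

// Guarded capability probe: safe under SSR and before voices have loaded.
export function useCapabilityGuarded(): CapabilitySpeechSynthesis {
  const hasSynthesis = typeof window !== 'undefined' && 'speechSynthesis' in window;
  const voices = hasSynthesis ? window.speechSynthesis.getVoices() : [];
  return {
    // getVoices() may legitimately be empty until 'voiceschanged' fires,
    // so an empty list here means "unknown yet", not "unsupported"
    mayWork: hasSynthesis && voices.length > 0,
    isConfiguredServerSide: false,
    isConfiguredClientSide: hasSynthesis,
  };
}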
diff --git a/src/modules/browser/speech-synthesis/preSelect/Languages.json b/src/modules/browser/speech-synthesis/preSelect/Languages.json
new file mode 100644
index 000000000..a2b9ade45
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/preSelect/Languages.json
@@ -0,0 +1,75 @@
+{
+  "ar": "Arabic",
+  "as": "Assamese",
+  "bg": "Bulgarian",
+  "bho": "Bhojpuri",
+  "bn": "Bangla",
+  "brx": "Bodo",
+  "bs": "Bosnian",
+  "ca": "Catalan",
+  "cmn": "Chinese",
+  "cs": "Czech",
+  "cy": "Welsh",
+  "da": "Danish",
+  "de": "German",
+  "doi": "Dogri",
+  "el": "Greek",
+  "en": "English",
+  "es": "Spanish",
+  "et": "Estonian",
+  "eu": "Basque",
+  "fa": "Persian",
+  "fi": "Finnish",
+  "fil": "Filipino",
+  "fr": "French",
+  "gl": "Galician",
+  "gu": "Gujarati",
+  "he": "Hebrew",
+  "hi": "Hindi",
+  "hr": "Croatian",
+  "hu": "Hungarian",
+  "id": "Indonesian",
+  "is": "Icelandic",
+  "it": "Italian",
+  "ja": "Japanese",
+  "jv": "Javanese",
+  "km": "khmer",
+  "kn": "Kannada",
+  "kok": "Konkani",
+  "ko": "Korean",
+  "lt": "Lithuanian",
+  "lv": "Latvia",
+  "mai": "Maithili",
+  "mal": "Malayalam",
+  "mni": "Manipuri",
+  "mr": "Marathi",
+  "ms": "Malay",
+  "nb": "Norwegian Bokmål",
+  "ne": "Nepali",
+  "nl": "Dutch",
+  "od": "Odia",
+  "pa": "Punjabi",
+  "pl": "Polish",
+  "pt": "Portuguese",
+  "ro": "Romanian",
+  "ru": "Russian",
+  "sa": "Sanskrit",
+  "sat": "Santali",
+  "sd": "Sindhi",
+  "si": "Sinhala",
+  "sk": "Slovak",
+  "sl": "Slovenian",
+  "sq": "Albanese",
+  "sr": "Serbian",
+  "su": "Sundanese",
+  "sv": "Swedish",
+  "sw": "Swahili",
+  "ta": "Tamil",
+  "te": "Telugu",
+  "th": "Thai",
+  "tr": "Turkish",
+  "uk": "Ukrainian",
+  "ur": "Urdu",
+  "vi": "Vietnamese",
+  "wuu": "Shanghainese"
+}
\ No newline at end of file

diff --git a/src/modules/browser/speech-synthesis/store-module-browser.tsx b/src/modules/browser/speech-synthesis/store-module-browser.tsx
new file mode 100644
index 000000000..434c9c359
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/store-module-browser.tsx
@@ -0,0 +1,40 @@
+import { create } from 'zustand';
+import { persist } from 'zustand/middleware';
+import { useShallow } from 'zustand/react/shallow';
+
+export type BrowsePageTransform = 'html' | 'text' | 'markdown';
+
+interface BrowseState {
+
+  languageCodeForFilter: string;
+  browseVoiceId: string;
+  setBrowseVoiceId: (value: string) => void;
+  setLanguageCodeForFilter: (value: string) => void;
+
+}
+
+export const useBrowseStore = create<BrowseState>()(
+  persist(
+    (set) => ({
+      languageCodeForFilter: '',
+      browseVoiceId: '',
+      setBrowseVoiceId: (browseVoiceId: string) => set(() => ({ browseVoiceId })),
+      setLanguageCodeForFilter: (languageCodeForFilter: string) => set(() => ({ languageCodeForFilter })),
+    }),
+    {
+      name: 'app-module-browse',
+    },
+  ),
+);
+
+export function useBrowseVoiceId(): [string, (value: string) => void] {
+  return useBrowseStore(useShallow(state => [state.browseVoiceId, state.setBrowseVoiceId]))
+}
+
+export function useLanguageCodeForFilter(): [string, (value: string) => void] {
+  return useBrowseStore(useShallow(state => [state.languageCodeForFilter, state.setLanguageCodeForFilter]))
+}
+
+export function getBrowseVoiceId() {
+  return useBrowseStore.getState().browseVoiceId
+}
\ No newline at end of file

diff --git a/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx b/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx
new file mode 100644
index 000000000..9db7d462e
--- /dev/null
+++ b/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx
@@ -0,0 +1,124 @@
+import * as React from 'react';
+
+import { CircularProgress, Option, Select } from '@mui/joy';
+import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
+import RecordVoiceOverTwoToneIcon from '@mui/icons-material/RecordVoiceOverTwoTone';
+
+import { useBrowseVoiceId } from './store-module-browser';
+import { speakText, cancel } from './browser.speechSynthesis.client';
+
+function VoicesDropdown(props: {
+  isValidKey: boolean;
+  isFetchingVoices: boolean;
+  isErrorVoices: boolean;
+  disabled?: boolean;
+  voices: SpeechSynthesisVoice[];
+  voiceId: string;
+  setVoiceId: (voiceId: string) => void;
+}) {
+  const handleVoiceChange = (_event: any, value: string | null) => props.setVoiceId(value === null ? '' : value);
+
+  return (
+    
+  );
+}
+
+function allVoicesObtained(): Promise<SpeechSynthesisVoice[]> {
+  return new Promise(function (resolve, reject) {
+    let voices = window.speechSynthesis.getVoices();
+    if (voices.length !== 0) {
+      resolve(voices);
+    } else {
+      window.speechSynthesis.addEventListener('voiceschanged', function () {
+        voices = window.speechSynthesis.getVoices();
+        resolve(voices);
+      });
+    }
+  });
+}
+
+export function useBrowserSpeechVoices() {
+  const [voices, setVoices] = React.useState<SpeechSynthesisVoice[]>([]);
+
+  React.useEffect(() => {
+    allVoicesObtained().then((data) => setVoices(data));
+  }, []);
+
+  return {
+    hasVoices: voices.length > 0,
+    voices: voices || [],
+  };
+}
+
+export function useBrowserSpeechVoiceDropdown(
+  autoSpeak: boolean,
+  {
+    disabled,
+    voiceNameFilters,
+    testUtterance,
+  }: {
+    disabled?: boolean;
+    voiceNameFilters?: string[] | null;
+    testUtterance?: string | null;
+  },
+) {
+  // external state
+  const { hasVoices, voices } = useBrowserSpeechVoices();
+  const [voiceId, setVoiceId] = useBrowseVoiceId();
+
+  // derived state
+  const voice = voices.find((voice) => voiceId === voice.name);
+  const voiceFiltered = voiceNameFilters ? voices.filter((voice) => voiceNameFilters.includes(voice.name)) : voices;
+
+  // [E] autoSpeak
+  React.useEffect(() => {
+    if (autoSpeak && voice && voiceFiltered.includes(voice)) {
+      speakText(testUtterance ? testUtterance.replace('{name}', voice.name) : `How can I assist you today?`, String(voiceId));
+    }
+    return () => {
+      cancel();
+    };
+  }, [autoSpeak, testUtterance, voice, voiceFiltered, voiceId, voiceNameFilters]);
+
+  const voicesDropdown = React.useMemo(
+    () => (
+      
+    ),
+    [disabled, setVoiceId, voiceFiltered, voiceId],
+  );
+
+  return {
+    hasVoices,
+    voiceId,
+    voiceName: voice?.name,
+    voicesDropdown,
+  };
+}

diff --git a/src/modules/elevenlabs/ElevenlabsSettings.tsx b/src/modules/elevenlabs/ElevenlabsSettings.tsx
index 51b07db94..97ebc64b3 100644
--- a/src/modules/elevenlabs/ElevenlabsSettings.tsx
+++ b/src/modules/elevenlabs/ElevenlabsSettings.tsx
@@ -5,7 +5,7 @@ import { FormControl } from '@mui/joy';
 import { AlreadySet } from '~/common/components/AlreadySet';
 import { FormInputKey } from '~/common/components/forms/FormInputKey';
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
-import { useCapabilityElevenLabs } from '~/common/components/useCapabilities';
+import { useVoiceCapability } from '~/common/components/useCapabilities';
 
 import { isElevenLabsEnabled } from './elevenlabs.client';
 import { useElevenLabsVoiceDropdown } from './useElevenLabsVoiceDropdown';
@@ -16,7 +16,7 @@ export function ElevenlabsSettings() {
 
   // external state
   const [apiKey, setApiKey] = useElevenLabsApiKey();
-  const { isConfiguredServerSide } = useCapabilityElevenLabs();
+  const { isConfiguredServerSide } = useVoiceCapability();
 
   const { voicesDropdown } = useElevenLabsVoiceDropdown(true);

diff --git a/src/modules/elevenlabs/elevenlabs.client.ts b/src/modules/elevenlabs/elevenlabs.client.ts
index 7145cbdb1..9e7e5ed09 100644
--- a/src/modules/elevenlabs/elevenlabs.client.ts
+++ b/src/modules/elevenlabs/elevenlabs.client.ts
@@ -2,7 +2,7 @@ import { getBackendCapabilities } from '~/modules/backend/store-backend-capabili
 
 import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
 import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
-import { CapabilityElevenLabsSpeechSynthesis } from '~/common/components/useCapabilities';
+import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
 import { frontendSideFetch } from '~/common/util/clientFetchers';
 import { useUIPreferencesStore } from '~/common/state/store-ui';
 
@@ -17,7 +17,7 @@ export const isElevenLabsEnabled = (apiKey?: string) => apiKey
   ? isValidElevenLabsApiKey(apiKey)
   : getBackendCapabilities().hasVoiceElevenLabs;
 
-export function useCapability(): CapabilityElevenLabsSpeechSynthesis {
+export function useCapability(): CapabilitySpeechSynthesis {
   const [clientApiKey, voiceId] = useElevenLabsData();
   const isConfiguredServerSide = getBackendCapabilities().hasVoiceElevenLabs;
   const isConfiguredClientSide = clientApiKey ? isValidElevenLabsApiKey(clientApiKey) : false;

diff --git a/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx b/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
index fdfaafe3a..24de0b003 100644
--- a/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
+++ b/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
@@ -82,6 +82,10 @@ export function useElevenLabsVoiceDropdown(autoSpeak: boolean, disabled?: boolea
 
   React.useEffect(() => {
     if (previewUrl)
       void AudioPlayer.playUrl(previewUrl);
+
+    return () => {
+      // TODO: stop audio
+    }
   }, [previewUrl]);
 
   const voicesDropdown = React.useMemo(() =>
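The useElevenLabsVoiceDropdown change above adds an effect cleanup with a TODO: the voice preview keeps playing if the component unmounts or previewUrl changes mid-playback. This patch does not show a stop API on the app's AudioPlayer util, but with a plain HTMLAudioElement the cleanup would look roughly like this (illustrative only, assuming a raw audio element instead of AudioPlayer):

React.useEffect(() => {
  if (!previewUrl) return;
  // assumption: play the preview through a raw HTMLAudioElement
  const audio = new Audio(previewUrl);
  void audio.play().catch(() => { /* autoplay may be blocked */ });
  return () => {
    audio.pause();   // stop playback on unmount / url change
    audio.src = '';  // release the media resource
  };
}, [previewUrl]);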
From 0d5f6613404ff86ae02bc9092f91f4318ec02279 Mon Sep 17 00:00:00 2001
From: zoollcar
Date: Thu, 24 Oct 2024 21:44:35 +0800
Subject: [PATCH 2/3] Add Types, Remove invalid languages

---
 src/apps/chat/store-app-chat.ts               |  22 ++--
 src/apps/settings-modal/SettingsModal.tsx     |   4 +-
 src/apps/settings-modal/VoiceSettings.tsx     | 123 +++++++++---------
 src/common/components/useVoiceCapabilities.ts |  79 ++++++-----
 .../speech-synthesis/preSelect/Languages.json |  31 -----
 5 files changed, 123 insertions(+), 136 deletions(-)

diff --git a/src/apps/chat/store-app-chat.ts b/src/apps/chat/store-app-chat.ts
index 4760fe161..2a91157ab 100644
--- a/src/apps/chat/store-app-chat.ts
+++ b/src/apps/chat/store-app-chat.ts
@@ -1,7 +1,7 @@
 import { create } from 'zustand';
 import { persist } from 'zustand/middleware';
 import { useShallow } from 'zustand/react/shallow';
-import { ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
+import { ASREngineKey, ASREngineList, TTSEngineKey, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 import type { DLLMId } from '~/common/stores/llms/llms.types';
 
@@ -52,11 +52,11 @@ interface AppChatStore {
   micTimeoutMs: number;
   setMicTimeoutMs: (micTimeoutMs: number) => void;
 
-  TTSEngine: string;
-  setTTSEngine: (TTSEngine: string) => void;
+  TTSEngine: TTSEngineKey;
+  setTTSEngine: (TTSEngine: TTSEngineKey) => void;
 
-  ASREngine: string;
-  setASREngine: (ASREngine: string) => void;
+  ASREngine: ASREngineKey;
+  setASREngine: (ASREngine: ASREngineKey) => void;
 
   showPersonaIcons: boolean;
   setShowPersonaIcons: (showPersonaIcons: boolean) => void;
@@ -121,11 +121,11 @@ const useAppChatStore = create<AppChatStore>()(persist(
   micTimeoutMs: 2000,
   setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),
 
-  TTSEngine: TTSEngineList[0],
-  setTTSEngine: (TTSEngine: string) => _set({ TTSEngine }),
+  TTSEngine: TTSEngineList[0].key,
+  setTTSEngine: (TTSEngine: TTSEngineKey) => _set({ TTSEngine }),
 
-  ASREngine: ASREngineList[0],
-  setASREngine: (ASREngine: string) => _set({ ASREngine }),
+  ASREngine: ASREngineList[0].key,
+  setASREngine: (ASREngine: ASREngineKey) => _set({ ASREngine }),
 
   showPersonaIcons: true,
   setShowPersonaIcons: (showPersonaIcons: boolean) => _set({ showPersonaIcons }),
@@ -211,11 +211,11 @@ export const useChatMicTimeoutMsValue = (): number =>
 export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] =>
   useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs]));
 
-export const useTTSEngine = (): [string, (micTimeoutMs: string) => void] =>
+export const useTTSEngine = (): [TTSEngineKey, (TTSEngine: TTSEngineKey) => void] =>
   useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
 export const getTTSEngine = () => useAppChatStore.getState().TTSEngine;
 
-export const useASREngine = (): [string, (micTimeoutMs: string) => void] =>
+export const useASREngine = (): [ASREngineKey, (ASREngine: ASREngineKey) => void] =>
   useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine]));
 
 export const useChatDrawerFilters = () => {

diff --git a/src/apps/settings-modal/SettingsModal.tsx b/src/apps/settings-modal/SettingsModal.tsx
index 2af374cac..feeccb050 100644
--- a/src/apps/settings-modal/SettingsModal.tsx
+++ b/src/apps/settings-modal/SettingsModal.tsx
@@ -198,10 +198,10 @@ export function SettingsModal(props: {
 
-      {TTSEngine === 'Elevenlabs' &&
+      {TTSEngine === 'elevenlabs' &&
       
       }
-      {TTSEngine === 'Web Speech API' &&
+      {TTSEngine === 'webspeech' &&
       
       }
 

diff --git a/src/apps/settings-modal/VoiceSettings.tsx b/src/apps/settings-modal/VoiceSettings.tsx
index 97712ad2c..fcc9725d3 100644
--- a/src/apps/settings-modal/VoiceSettings.tsx
+++ b/src/apps/settings-modal/VoiceSettings.tsx
@@ -4,77 +4,82 @@ import { FormControl } from '@mui/joy';
 
 import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
-
-
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
 import { LanguageSelect } from '~/common/components/LanguageSelect';
 import { useIsMobile } from '~/common/components/useMatchMedia';
-import { hasVoices, ASREngineList, TTSEngineList } from '~/common/components/useVoiceCapabilities';
+import { hasVoices, ASREngineList, TTSEngineList, TTSEngineKey } from '~/common/components/useVoiceCapabilities';
 
 export function VoiceSettings() {
-
   // external state
   const isMobile = useIsMobile();
   const { autoSpeak, setAutoSpeak } = useChatAutoAI();
-  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
-  const [TTSEngine, setTTSEngine ] = useTTSEngine();
-  const [ASREngine, setASREngine ] = useASREngine();
+  const [chatTimeoutMs, setChatTimeoutMs] = useChatMicTimeoutMs();
+  const [TTSEngine, setTTSEngine] = useTTSEngine();
+  const [ASREngine, setASREngine] = useASREngine();
 
   // this converts from string keys to numbers and vice versa
   const chatTimeoutValue: string = '' + chatTimeoutMs;
   const setChatTimeoutValue = (value: string) => value && setChatTimeoutMs(parseInt(value));
 
-  return <>
-
-    {/* LanguageSelect: moved from the UI settings (where it logically belongs), just to group things better from an UX perspective */}
-    
-    
-    
-
-    
-
-    {!isMobile && 
-      5000 ? 'Best for thinking' : 'Standard'}
-      options={[
-        { value: '600', label: '.6s' },
-        { value: '2000', label: '2s' },
-        { value: '15000', label: '15s' },
-      ]}
-      value={chatTimeoutValue} onChange={setChatTimeoutValue}
-    />}
-
-    
-
-    
-      ({ value: i, label: i }))}
-      value={TTSEngine} onChange={setTTSEngine}
-    />
-
-    
-      ({ value: i, label: i }))}
-      value={ASREngine} onChange={setASREngine}
-    />
-
-  ;
-}
\ No newline at end of file
+  return (
+    <>
+      {/* LanguageSelect: moved from the UI settings (where it logically belongs), just to group things better from an UX perspective */}
+      
+      
+      
+
+      
+
+      {!isMobile && (
+        
+          5000 ? 'Best for thinking' : 'Standard'}
+          options={[
+            { value: '600', label: '.6s' },
+            { value: '2000', label: '2s' },
+            { value: '15000', label: '15s' },
+          ]}
+          value={chatTimeoutValue}
+          onChange={setChatTimeoutValue}
+        />
+      )}
+
+      
+
+      
+        ({ value: i.key, label: i.label }))}
+        value={TTSEngine}
+        onChange={setTTSEngine}
+      />
+
+      
+        ({ value: i.key, label: i.label }))}
+        value={ASREngine}
+        onChange={setASREngine}
+      />
+    
+  );
+}

diff --git a/src/common/components/useVoiceCapabilities.ts b/src/common/components/useVoiceCapabilities.ts
index ad1cc253a..ecfc37442 100644
--- a/src/common/components/useVoiceCapabilities.ts
+++ b/src/common/components/useVoiceCapabilities.ts
@@ -1,47 +1,60 @@
 import { getTTSEngine } from 'src/apps/chat/store-app-chat';
 import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
 
-import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client'
-import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client'
-import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client'
+import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client';
+import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client';
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client';
 
-import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
-import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
-import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client'
+import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
+import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
+import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
 
 import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
 import { useBrowserSpeechVoices } from '~/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown';
 
-export const TTSEngineList: string[] = [
-  'Elevenlabs',
-  'Web Speech API'
-]
+export type TTSEngineKey = 'elevenlabs' | 'webspeech';
+export type ASREngineKey = 'webspeech';
 
-export const ASREngineList: string[] = [
-  'Web Speech API'
-]
+export const TTSEngineList: { key: TTSEngineKey; label: string }[] = [
+  {
+    key: 'elevenlabs',
+    label: 'ElevenLabs',
+  },
+  {
+    key: 'webspeech',
+    label: 'Web Speech API',
+  },
+];
+
+export const ASREngineList: { key: ASREngineKey; label: string }[] = [
+  {
+    key: 'webspeech',
+    label: 'Web Speech API',
+  },
+];
 
-export function getConditionalVoices(){
+export function getConditionalVoices() {
   const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'Elevenlabs') {
-    return useElevenLabsVoices
-  }else if (TTSEngine === 'Web Speech API') {
-    return useBrowserSpeechVoices
+  if (TTSEngine === 'elevenlabs') {
+    return useElevenLabsVoices;
+  }
+  if (TTSEngine === 'webspeech') {
+    return useBrowserSpeechVoices;
   }
-  throw new Error('TTSEngine is not found');
 }
 
 export function hasVoices(): boolean {
-  console.log('getConditionalVoices', getConditionalVoices()().hasVoices)
+  console.log('getConditionalVoices', getConditionalVoices()().hasVoices);
   return getConditionalVoices()().hasVoices;
 }
 
 export function getConditionalCapability(): () => CapabilitySpeechSynthesis {
   const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'Elevenlabs') {
-    return useElevenlabsCapability
-  }else if (TTSEngine === 'Web Speech API') {
-    return useBrowserSpeechSynthesisCapability
+  if (TTSEngine === 'elevenlabs') {
+    return useElevenlabsCapability;
+  }
+  if (TTSEngine === 'webspeech') {
+    return useBrowserSpeechSynthesisCapability;
   }
   throw new Error('TTSEngine is not found');
 }
 
 export function useCapability(): CapabilitySpeechSynthesis {
   return getConditionalCapability()();
 }
 
-
 export async function speakText(text: string, voiceId?: string) {
   const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'Elevenlabs') {
+  if (TTSEngine === 'elevenlabs') {
     return await elevenlabsSpeakText(text, voiceId);
-  }else if (TTSEngine === 'Web Speech API') {
+  }
+  if (TTSEngine === 'webspeech') {
     return await browserSpeechSynthesisSpeakText(text, voiceId);
   }
-  throw new Error('TTSEngine is not found');
 }
 
 // let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
 
 export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
   const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'Elevenlabs') {
+  if (TTSEngine === 'elevenlabs') {
     return await EXPERIMENTAL_elevenlabsSpeakTextStream(text, voiceId);
-  }else if (TTSEngine === 'Web Speech API') {
+  }
+  if (TTSEngine === 'webspeech') {
     return await EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream(text, voiceId);
   }
-  throw new Error('TTSEngine is not found');
-}
\ No newline at end of file
+  throw new Error('TTSEngine is not found');
+}
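With this second patch, each engine list entry carries both a stable key (persisted in the store) and a human-readable label (shown in the settings controls), and the TTSEngineKey/ASREngineKey unions narrow the store fields from plain string to the valid keys. If the list ever grows, the union can also be derived from the list itself so the two cannot drift apart — a sketch of that alternative (not what the patch does):

// Derive the key union from the list, so adding an engine updates both at once.
const TTS_ENGINES = [
  { key: 'elevenlabs', label: 'ElevenLabs' },
  { key: 'webspeech', label: 'Web Speech API' },
] as const;

type TTSEngineKeyDerived = typeof TTS_ENGINES[number]['key']; // 'elevenlabs' | 'webspeech'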
diff --git a/src/modules/browser/speech-synthesis/preSelect/Languages.json b/src/modules/browser/speech-synthesis/preSelect/Languages.json
index a2b9ade45..69c3edec8 100644
--- a/src/modules/browser/speech-synthesis/preSelect/Languages.json
+++ b/src/modules/browser/speech-synthesis/preSelect/Languages.json
@@ -1,75 +1,44 @@
 {
   "ar": "Arabic",
-  "as": "Assamese",
-  "bg": "Bulgarian",
   "bho": "Bhojpuri",
   "bn": "Bangla",
-  "brx": "Bodo",
-  "bs": "Bosnian",
   "ca": "Catalan",
   "cmn": "Chinese",
   "cs": "Czech",
-  "cy": "Welsh",
   "da": "Danish",
   "de": "German",
-  "doi": "Dogri",
   "el": "Greek",
   "en": "English",
   "es": "Spanish",
-  "et": "Estonian",
   "eu": "Basque",
   "fa": "Persian",
   "fi": "Finnish",
-  "fil": "Filipino",
   "fr": "French",
   "gl": "Galician",
-  "gu": "Gujarati",
   "he": "Hebrew",
   "hi": "Hindi",
   "hr": "Croatian",
   "hu": "Hungarian",
   "id": "Indonesian",
-  "is": "Icelandic",
   "it": "Italian",
   "ja": "Japanese",
-  "jv": "Javanese",
-  "km": "khmer",
-  "kn": "Kannada",
-  "kok": "Konkani",
   "ko": "Korean",
-  "lt": "Lithuanian",
-  "lv": "Latvia",
-  "mai": "Maithili",
-  "mal": "Malayalam",
-  "mni": "Manipuri",
   "mr": "Marathi",
   "ms": "Malay",
   "nb": "Norwegian Bokmål",
-  "ne": "Nepali",
   "nl": "Dutch",
-  "od": "Odia",
-  "pa": "Punjabi",
   "pl": "Polish",
   "pt": "Portuguese",
   "ro": "Romanian",
   "ru": "Russian",
-  "sa": "Sanskrit",
-  "sat": "Santali",
-  "sd": "Sindhi",
-  "si": "Sinhala",
   "sk": "Slovak",
   "sl": "Slovenian",
-  "sq": "Albanese",
-  "sr": "Serbian",
-  "su": "Sundanese",
   "sv": "Swedish",
-  "sw": "Swahili",
   "ta": "Tamil",
   "te": "Telugu",
   "th": "Thai",
   "tr": "Turkish",
   "uk": "Ukrainian",
-  "ur": "Urdu",
   "vi": "Vietnamese",
   "wuu": "Shanghainese"
 }
\ No newline at end of file

From a538cc195a19924d48fc953707b1a1d39e93fdb1 Mon Sep 17 00:00:00 2001
From: zoollcar
Date: Fri, 25 Oct 2024 23:22:51 +0800
Subject: [PATCH 3/3] Abstract TTS module

---
 app/api/elevenlabs/speech/route.ts            |   2 +-
 pages/info/debug.tsx                          |   5 +-
 src/apps/call/CallWizard.tsx                  |   6 +-
 src/apps/call/Telephone.tsx                   |  14 +--
 src/apps/chat/AppChat.tsx                     |   2 +-
 src/apps/chat/components/ChatMessageList.tsx  |   5 +-
 .../persona/PersonaChatMessageSpeak.ts        |   2 +-
 src/apps/chat/store-app-chat.ts               |  12 +-
 src/apps/settings-modal/SettingsModal.tsx     |  16 +--
 src/apps/settings-modal/VoiceSettings.tsx     |  74 ++++++++----
 src/common/components/useCapabilities.ts      |  12 --
 src/common/components/useVoiceCapabilities.ts |  87 --------------
 src/modules/asr/asr.client.ts                 |   8 ++
 .../browser.speechSynthesis.client.ts         |  48 --------
 src/modules/elevenlabs/elevenlabs.client.ts   |  98 ----------------
 src/modules/tts/tts.client.hooks.ts           |  11 ++
 src/modules/tts/tts.client.ts                 |  41 +++++++
 src/modules/tts/tts.setting.tsx               |  11 ++
 src/modules/tts/useTTSStore.ts                |  34 ++++++
 src/modules/tts/vendors/ISpeechSynthesis.ts   |  30 +++++
 .../elevenlabs/ElevenlabsSettings.tsx         |   5 +-
 .../vendors}/elevenlabs/elevenlabs.router.ts  |   0
 .../vendors}/elevenlabs/elevenlabs.server.ts  |   0
 .../vendors/elevenlabs/elevenlabs.vendor.ts   | 107 ++++++++++++++++++
 .../elevenlabs/store-module-elevenlabs.ts     |   0
 .../elevenlabs/useElevenLabsVoiceDropdown.tsx |   2 +-
 src/modules/tts/vendors/vendors.registry.ts   |  19 ++++
 .../vendors/webspeech/WebspeechSettings.tsx}  |   6 +-
 .../webspeech}/preSelect/Languages.json       |   0
 .../webspeech/store-module-webspeech.ts}      |   0
 .../webspeech/useWebspeechVoiceDropdown.tsx}  |   4 +-
 .../tts/vendors/webspeech/webspeech.vendor.ts |  65 +++++++++++
 src/server/api/trpc.router-edge.ts            |   2 +-
 33 files changed, 417 insertions(+), 311 deletions(-)
 delete mode 100644 src/common/components/useVoiceCapabilities.ts
 create mode 100644 src/modules/asr/asr.client.ts
 delete mode 100644 src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts
 delete mode 100644 src/modules/elevenlabs/elevenlabs.client.ts
 create mode 100644 src/modules/tts/tts.client.hooks.ts
 create mode 100644 src/modules/tts/tts.client.ts
 create mode 100644 src/modules/tts/tts.setting.tsx
 create mode 100644 src/modules/tts/useTTSStore.ts
 create mode 100644 src/modules/tts/vendors/ISpeechSynthesis.ts
 rename src/modules/{ => tts/vendors}/elevenlabs/ElevenlabsSettings.tsx (87%)
 rename src/modules/{ => tts/vendors}/elevenlabs/elevenlabs.router.ts (100%)
 rename src/modules/{ => tts/vendors}/elevenlabs/elevenlabs.server.ts (100%)
 create mode 100644 src/modules/tts/vendors/elevenlabs/elevenlabs.vendor.ts
 rename src/modules/{ => tts/vendors}/elevenlabs/store-module-elevenlabs.ts (100%)
 rename src/modules/{ => tts/vendors}/elevenlabs/useElevenLabsVoiceDropdown.tsx (98%)
 create mode 100644 src/modules/tts/vendors/vendors.registry.ts
 rename src/modules/{browser/speech-synthesis/BrowserSpeechSettings.tsx => tts/vendors/webspeech/WebspeechSettings.tsx} (95%)
 rename src/modules/{browser/speech-synthesis => tts/vendors/webspeech}/preSelect/Languages.json (100%)
 rename src/modules/{browser/speech-synthesis/store-module-browser.tsx => tts/vendors/webspeech/store-module-webspeech.ts} (100%)
 rename src/modules/{browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx => tts/vendors/webspeech/useWebspeechVoiceDropdown.tsx} (96%)
 create mode 100644 src/modules/tts/vendors/webspeech/webspeech.vendor.ts
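The file list above gives the shape of this third patch: each engine becomes a vendor module under src/modules/tts/vendors/ (elevenlabs.vendor.ts, webspeech.vendor.ts) implementing a common ISpeechSynthesis interface, vendors.registry.ts maps engine keys to vendors, and tts.client.ts dispatches through the registry instead of branching on string comparisons. The vendor files themselves fall below the truncation point of this excerpt, so the shape sketched here is an assumption inferred from the visible call sites (speakText, EXPERIMENTAL_speakTextStream, useTTSCapability, getName), not the patch's actual code:

// Assumed shape of the vendor abstraction; the real ISpeechSynthesis.ts
// is not visible in this excerpt of the patch.
type TTSEngineKey = 'elevenlabs' | 'webspeech';

interface ISpeechSynthesis {
  name: string;
  speakText(text: string, voiceId?: string): Promise<void>;
}

declare const elevenlabsVendor: ISpeechSynthesis; // provided by elevenlabs.vendor.ts
declare const webspeechVendor: ISpeechSynthesis;  // provided by webspeech.vendor.ts
declare function getTTSEngine(): TTSEngineKey;    // persisted selection, from useTTSStore.ts

const vendorRegistry: Record<TTSEngineKey, ISpeechSynthesis> = {
  elevenlabs: elevenlabsVendor,
  webspeech: webspeechVendor,
};

// tts.client.ts can then dispatch without per-engine string branches:
function speakText(text: string, voiceId?: string): Promise<void> {
  return vendorRegistry[getTTSEngine()].speakText(text, voiceId);
}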
diff --git a/app/api/elevenlabs/speech/route.ts b/app/api/elevenlabs/speech/route.ts
index d7a8d6e7b..7a9f21678 100644
--- a/app/api/elevenlabs/speech/route.ts
+++ b/app/api/elevenlabs/speech/route.ts
@@ -1,2 +1,2 @@
 export const runtime = 'edge';
-export { elevenLabsHandler as POST } from '~/modules/elevenlabs/elevenlabs.server';
\ No newline at end of file
+export { elevenLabsHandler as POST } from '~/modules/tts/vendors/elevenlabs/elevenlabs.server';
\ No newline at end of file

diff --git a/pages/info/debug.tsx b/pages/info/debug.tsx
index 5d390020d..8b27709b9 100644
--- a/pages/info/debug.tsx
+++ b/pages/info/debug.tsx
@@ -18,7 +18,8 @@ import { ROUTE_APP_CHAT, ROUTE_INDEX } from '~/common/app.routes';
 import { Release } from '~/common/app.release';
 
 // capabilities access
-import { useCapabilityBrowserSpeechRecognition, useVoiceCapability, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 
 // stores access
 import { getLLMsDebugInfo } from '~/common/stores/llms/store-llms';
@@ -95,7 +96,7 @@ function AppDebug() {
   const cProduct = {
     capabilities: {
       mic: useCapabilityBrowserSpeechRecognition(),
-      elevenLabs: useVoiceCapability(),
+      elevenLabs: useTTSCapability(),
       textToImage: useCapabilityTextToImage(),
     },
     models: getLLMsDebugInfo(),

diff --git a/src/apps/call/CallWizard.tsx b/src/apps/call/CallWizard.tsx
index d7bdb767a..190f76ab4 100644
--- a/src/apps/call/CallWizard.tsx
+++ b/src/apps/call/CallWizard.tsx
@@ -12,11 +12,13 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 import { animationColorRainbow } from '~/common/util/animUtils';
 import { navigateBack } from '~/common/app.routes';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
-import { useCapabilityBrowserSpeechRecognition, useVoiceCapability } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 import { useChatStore } from '~/common/stores/chat/store-chats';
 import { useUICounter } from '~/common/state/store-ui';
 
+
 function StatusCard(props: { icon: React.JSX.Element, hasIssue: boolean, text: string, button?: React.JSX.Element }) {
   return (
@@ -45,7 +47,7 @@ export function CallWizard(props: { strict?: boolean, conversationId: string | n
 
   // external state
   const recognition = useCapabilityBrowserSpeechRecognition();
-  const synthesis = useVoiceCapability();
+  const synthesis = useTTSCapability();
 
   const chatIsEmpty = useChatStore(state => {
     if (!props.conversationId) return false;

diff --git a/src/apps/call/Telephone.tsx b/src/apps/call/Telephone.tsx
index d3ffddd92..72c728515 100644
--- a/src/apps/call/Telephone.tsx
+++ b/src/apps/call/Telephone.tsx
@@ -13,10 +13,10 @@ import { ScrollToBottom } from '~/common/scroll-to-bottom/ScrollToBottom';
 import { ScrollToBottomButton } from '~/common/scroll-to-bottom/ScrollToBottomButton';
 import { useChatLLMDropdown } from '../chat/components/layout-bar/useLLMDropdown';
 
-import { EXPERIMENTAL_speakTextStream } from '~/common/components/useVoiceCapabilities';
+import { EXPERIMENTAL_speakTextStream } from '~/modules/tts/tts.client';
 import { SystemPurposeId, SystemPurposes } from '../../data';
 import { llmStreamingChatGenerate, VChatMessageIn } from '~/modules/llms/llm.client';
-import { useElevenLabsVoiceDropdown } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
+import { TTSSetting } from '~/modules/tts/tts.setting';
 
 import type { OptimaBarControlMethods } from '~/common/layout/optima/bar/OptimaBarDropdown';
 import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
@@ -39,6 +39,7 @@ import { CallStatus } from './components/CallStatus';
 import { useAppCallStore } from './state/store-app-call';
 
 
+
 function CallMenuItems(props: {
   pushToTalk: boolean,
   setPushToTalk: (pushToTalk: boolean) => void,
@@ -48,8 +49,7 @@ function CallMenuItems(props: {
 
   // external state
   const { grayUI, toggleGrayUI } = useAppCallStore();
-  const { voicesDropdown } = useElevenLabsVoiceDropdown(false, !props.override);
-
+
   const handlePushToTalkToggle = () => props.setPushToTalk(!props.pushToTalk);
 
   const handleChangeVoiceToggle = () => props.setOverride(!props.override);
@@ -68,10 +68,10 @@ function CallMenuItems(props: {
 
-      
-      {' '}
-      {voicesDropdown}
+      
+      
+      
 

diff --git a/src/apps/chat/AppChat.tsx b/src/apps/chat/AppChat.tsx
index 98f4837da..a21dd584e 100644
--- a/src/apps/chat/AppChat.tsx
+++ b/src/apps/chat/AppChat.tsx
@@ -10,7 +10,7 @@ import { FlattenerModal } from '~/modules/aifn/flatten/FlattenerModal';
 import { TradeConfig, TradeModal } from '~/modules/trade/TradeModal';
 import { downloadSingleChat, importConversationsFromFilesAtRest, openConversationsAtRestPicker } from '~/modules/trade/trade.client';
 import { imaginePromptFromTextOrThrow } from '~/modules/aifn/imagine/imaginePromptFromText';
-import { speakText } from '~/common/components/useVoiceCapabilities';
+import { speakText } from '~/modules/tts/tts.client';
 import { useAreBeamsOpen } from '~/modules/beam/store-beam.hooks';
 import { useCapabilityTextToImage } from '~/modules/t2i/t2i.client';

diff --git a/src/apps/chat/components/ChatMessageList.tsx b/src/apps/chat/components/ChatMessageList.tsx
index 22f62f9c9..2b0df25fe 100644
--- a/src/apps/chat/components/ChatMessageList.tsx
+++ b/src/apps/chat/components/ChatMessageList.tsx
@@ -19,7 +19,7 @@ import { getConversation, useChatStore } from '~/common/stores/chat/store-chats'
 import { openFileForAttaching } from '~/common/components/ButtonAttachFiles';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
 import { useBrowserTranslationWarning } from '~/common/components/useIsBrowserTranslating';
-import { useVoiceCapability } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 import { useChatOverlayStore } from '~/common/chat-overlay/store-perchat_vanilla';
 import { useScrollToBottom } from '~/common/scroll-to-bottom/useScrollToBottom';
@@ -30,6 +30,7 @@
 
 import { PersonaSelector } from './persona-selector/PersonaSelector';
 
 import { useChatAutoSuggestHTMLUI, useChatShowSystemMessages } from '../store-app-chat';
 
+
 const stableNoMessages: DMessage[] = [];
 
 /**
@@ -75,7 +76,7 @@ export function ChatMessageList(props: {
     _composerInReferenceToCount: state.inReferenceTo?.length ?? 0,
     ephemerals: state.ephemerals?.length ? state.ephemerals : null,
   })));
-  const { mayWork: isSpeakable } = useVoiceCapability();
+  const { mayWork: isSpeakable } = useTTSCapability();
 
   // derived state
   const { conversationHandler, conversationId, capabilityHasT2I, onConversationBranch, onConversationExecuteHistory, onTextDiagram, onTextImagine, onTextSpeak } = props;

diff --git a/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts b/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts
index a016af1da..6c753f771 100644
--- a/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts
+++ b/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts
@@ -1,4 +1,4 @@
-import { speakText } from '~/modules/elevenlabs/elevenlabs.client';
+import { speakText } from '~/modules/tts/tts.client';
 
 import { isTextContentFragment } from '~/common/stores/chat/chat.fragments';
 

diff --git a/src/apps/chat/store-app-chat.ts b/src/apps/chat/store-app-chat.ts
index 2a91157ab..8a723af0c 100644
--- a/src/apps/chat/store-app-chat.ts
+++ b/src/apps/chat/store-app-chat.ts
@@ -1,9 +1,9 @@
 import { create } from 'zustand';
 import { persist } from 'zustand/middleware';
 import { useShallow } from 'zustand/react/shallow';
-import { ASREngineKey, ASREngineList, TTSEngineKey, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 import type { DLLMId } from '~/common/stores/llms/llms.types';
+import { ASREngineKey, ASREngineList } from '~/modules/asr/asr.client';
 
 export type ChatAutoSpeakType = 'off' | 'firstLine' | 'all';
 
@@ -52,11 +52,8 @@ interface AppChatStore {
   micTimeoutMs: number;
   setMicTimeoutMs: (micTimeoutMs: number) => void;
 
-  TTSEngine: TTSEngineKey;
-  setTTSEngine: (TTSEngine: TTSEngineKey) => void;
-
   ASREngine: ASREngineKey;
   setASREngine: (ASREngine: ASREngineKey) => void;
 
   showPersonaIcons: boolean;
   setShowPersonaIcons: (showPersonaIcons: boolean) => void;
@@ -121,11 +118,8 @@
   micTimeoutMs: 2000,
   setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),
 
-  TTSEngine: TTSEngineList[0].key,
-  setTTSEngine: (TTSEngine: TTSEngineKey) => _set({ TTSEngine }),
-
   ASREngine: ASREngineList[0].key,
   setASREngine: (ASREngine: ASREngineKey) => _set({ ASREngine }),
 
   showPersonaIcons: true,
   setShowPersonaIcons: (showPersonaIcons: boolean) => _set({ showPersonaIcons }),
@@ -211,10 +205,6 @@ export const useChatMicTimeoutMsValue = (): number =>
 export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] =>
   useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs]));
 
-export const useTTSEngine = (): [TTSEngineKey, (TTSEngine: TTSEngineKey) => void] =>
-  useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
-export const getTTSEngine = () => useAppChatStore.getState().TTSEngine;
-
 export const useASREngine = (): [ASREngineKey, (ASREngine: ASREngineKey) => void] =>
   useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine]));
 
 export const useChatDrawerFilters = () => {

diff --git a/src/apps/settings-modal/SettingsModal.tsx b/src/apps/settings-modal/SettingsModal.tsx
index 3dfe68256..fbecf1822 100644
--- a/src/apps/settings-modal/SettingsModal.tsx
+++ b/src/apps/settings-modal/SettingsModal.tsx
@@ -9,7 +9,6 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 
 import { BrowseSettings } from '~/modules/browse/BrowseSettings';
 import { DallESettings } from '~/modules/t2i/dalle/DallESettings';
-import { ElevenlabsSettings } from '~/modules/elevenlabs/ElevenlabsSettings';
 import { GoogleSearchSettings } from '~/modules/google/GoogleSearchSettings';
 import { ProdiaSettings } from '~/modules/t2i/prodia/ProdiaSettings';
 import { T2ISettings } from '~/modules/t2i/T2ISettings';
@@ -22,9 +21,9 @@ import { AppChatSettingsAI } from './AppChatSettingsAI';
 import { AppChatSettingsUI } from './settings-ui/AppChatSettingsUI';
 import { UxLabsSettings } from './UxLabsSettings';
 import { VoiceSettings } from './VoiceSettings';
-import { BrowserSpeechSettings } from '~/modules/browser/speech-synthesis/BrowserSpeechSettings';
-
-import { useTTSEngine } from 'src/apps/chat/store-app-chat';
+import { useTTSEngine } from '~/modules/tts/useTTSStore';
+import { TTSSetting } from '~/modules/tts/tts.setting';
+import { getName as getTTSEngineName } from '~/modules/tts/tts.client';
 
 // styled into a Topics component
@@ -198,12 +197,9 @@
 
-      {TTSEngine === 'elevenlabs' &&
-      
-      }
-      {TTSEngine === 'webspeech' &&
-      
-      }
+      
+      
+      
 

diff --git a/src/apps/settings-modal/VoiceSettings.tsx b/src/apps/settings-modal/VoiceSettings.tsx
index fcc9725d3..f33658a57 100644
--- a/src/apps/settings-modal/VoiceSettings.tsx
+++ b/src/apps/settings-modal/VoiceSettings.tsx
@@ -1,14 +1,17 @@
 import * as React from 'react';
 
-import { FormControl } from '@mui/joy';
+import { FormControl, Option, Select } from '@mui/joy';
+import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
 
-import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
+import { useASREngine, useChatAutoAI, useChatMicTimeoutMs } from '../chat/store-app-chat';
 
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
 import { LanguageSelect } from '~/common/components/LanguageSelect';
 import { useIsMobile } from '~/common/components/useMatchMedia';
-import { hasVoices, ASREngineList, TTSEngineList, TTSEngineKey } from '~/common/components/useVoiceCapabilities';
+import { ASREngineKey, ASREngineList } from '~/modules/asr/asr.client';
+import { TTSEngineKey, TTSEngineList, useTTSEngine } from '~/modules/tts/useTTSStore';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 
 export function VoiceSettings() {
   // external state
   const isMobile = useIsMobile();
@@ -23,6 +26,18 @@ export function VoiceSettings() {
   const chatTimeoutValue: string = '' + chatTimeoutMs;
   const setChatTimeoutValue = (value: string) => value && setChatTimeoutMs(parseInt(value));
 
+  const { mayWork: hasVoices } = useTTSCapability();
+
+  const handleTTSChanged = (_event: any, newValue: TTSEngineKey | null) => {
+    if (!newValue) return;
+    setTTSEngine(newValue);
+  };
+
+  const handleASRChanged = (_event: any, newValue: ASREngineKey | null) => {
+    if (!newValue) return;
+    setASREngine(newValue);
+  };
+
   return (
     <>
       {/* LanguageSelect: moved from the UI settings (where it logically belongs), just to group things better from an UX perspective */}
@@ -63,23 +78,44 @@ export function VoiceSettings() {
         onChange={setAutoSpeak}
       />
 
-      
-        ({ value: i.key, label: i.label }))}
-        value={TTSEngine}
-        onChange={setTTSEngine}
-      />
+      
+        
+
+        
+      
 
-      
-        ({ value: i.key, label: i.label }))}
-        value={ASREngine}
-        onChange={setASREngine}
-      />
+      
+        
+
+        
+      
     
   );
 }

diff --git a/src/common/components/useCapabilities.ts b/src/common/components/useCapabilities.ts
index 2d2effea2..59b0c51b0 100644
--- a/src/common/components/useCapabilities.ts
+++ b/src/common/components/useCapabilities.ts
@@ -21,18 +21,6 @@ export interface CapabilityBrowserSpeechRecognition {
 
 export { browserSpeechRecognitionCapability as useCapabilityBrowserSpeechRecognition } from './speechrecognition/useSpeechRecognition';
 
-
-/// Speech Synthesis
-
-export interface CapabilitySpeechSynthesis {
-  mayWork: boolean;
-  isConfiguredServerSide: boolean;
-  isConfiguredClientSide: boolean;
-}
-
-export { useCapability as useVoiceCapability } from '~/common/components/useVoiceCapabilities';
-
-
 /// Image Generation
 
 export interface TextToImageProvider {

diff --git a/src/common/components/useVoiceCapabilities.ts b/src/common/components/useVoiceCapabilities.ts
deleted file mode 100644
index ecfc37442..000000000
--- a/src/common/components/useVoiceCapabilities.ts
+++ /dev/null
@@ -1,87 +0,0 @@
-import { getTTSEngine } from 'src/apps/chat/store-app-chat';
-import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities';
-
-import { useCapability as useElevenlabsCapability } from '~/modules/elevenlabs/elevenlabs.client';
-import { speakText as elevenlabsSpeakText } from '~/modules/elevenlabs/elevenlabs.client';
-import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_elevenlabsSpeakTextStream } from '~/modules/elevenlabs/elevenlabs.client';
-
-import { useCapability as useBrowserSpeechSynthesisCapability } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
-import { speakText as browserSpeechSynthesisSpeakText } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
-import { EXPERIMENTAL_speakTextStream as EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream } from '~/modules/browser/speech-synthesis/browser.speechSynthesis.client';
-
-import { useElevenLabsVoices } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
-import { useBrowserSpeechVoices } from '~/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown';
-
-export type TTSEngineKey = 'elevenlabs' | 'webspeech';
-export type ASREngineKey = 'webspeech';
-
-export const TTSEngineList: { key: TTSEngineKey; label: string }[] = [
-  {
-    key: 'elevenlabs',
-    label: 'ElevenLabs',
-  },
-  {
-    key: 'webspeech',
-    label: 'Web Speech API',
-  },
-];
-
-export const ASREngineList: { key: ASREngineKey; label: string }[] = [
-  {
-    key: 'webspeech',
-    label: 'Web Speech API',
-  },
-];
-
-export function getConditionalVoices() {
-  const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'elevenlabs') {
-    return useElevenLabsVoices;
-  }
-  if (TTSEngine === 'webspeech') {
-    return useBrowserSpeechVoices;
-  }
-}
-
-export function hasVoices(): boolean {
-  console.log('getConditionalVoices', getConditionalVoices()().hasVoices);
-  return getConditionalVoices()().hasVoices;
-}
-
-export function getConditionalCapability(): () => CapabilitySpeechSynthesis {
-  const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'elevenlabs') {
-    return useElevenlabsCapability;
-  }
-  if (TTSEngine === 'webspeech') {
-    return useBrowserSpeechSynthesisCapability;
-  }
-  throw new Error('TTSEngine is not found');
-}
-
-export function useCapability(): CapabilitySpeechSynthesis {
-  return getConditionalCapability()();
-}
-
-export async function speakText(text: string, voiceId?: string) {
-  const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'elevenlabs') {
-    return await elevenlabsSpeakText(text, voiceId);
-  }
-  if (TTSEngine === 'webspeech') {
-    return await browserSpeechSynthesisSpeakText(text, voiceId);
-  }
-}
-
-// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
-
-export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
-  const TTSEngine = getTTSEngine();
-  if (TTSEngine === 'elevenlabs') {
-    return await EXPERIMENTAL_elevenlabsSpeakTextStream(text, voiceId);
-  }
-  if (TTSEngine === 'webspeech') {
-    return await EXPERIMENTAL_browserSpeechSynthesisSpeakTextStream(text, voiceId);
-  }
-  throw new Error('TTSEngine is not 
found'); -} diff --git a/src/modules/asr/asr.client.ts b/src/modules/asr/asr.client.ts new file mode 100644 index 000000000..30db9cf25 --- /dev/null +++ b/src/modules/asr/asr.client.ts @@ -0,0 +1,8 @@ +export type ASREngineKey = 'webspeech'; + +export const ASREngineList: { key: ASREngineKey; label: string }[] = [ + { + key: 'webspeech', + label: 'Web Speech API', + }, +]; diff --git a/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts b/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts deleted file mode 100644 index 2814a760a..000000000 --- a/src/modules/browser/speech-synthesis/browser.speechSynthesis.client.ts +++ /dev/null @@ -1,48 +0,0 @@ -import { CapabilitySpeechSynthesis } from "~/common/components/useCapabilities"; -import { getBrowseVoiceId } from "./store-module-browser"; - -export function useCapability(): CapabilitySpeechSynthesis { - const synth = window.speechSynthesis; - const voices = synth.getVoices(); - const isConfiguredServerSide = false; - const isConfiguredClientSide = true; - const mayWork = voices.length > 0; - return { mayWork, isConfiguredServerSide, isConfiguredClientSide }; -} - - -export async function speakText(text: string, voiceId?: string) { - if (!(text?.trim())) return; - - try { - const synth = window.speechSynthesis; - const utterThis = new SpeechSynthesisUtterance(text); - const voices = synth.getVoices(); - voiceId = voiceId || getBrowseVoiceId(); - utterThis.voice = voices.find((voice) => voiceId === voice.name) || null; - synth.speak(utterThis); - } catch (error) { - console.error('Error playing first text:', error); - } -} - -export async function cancel() { - const synth = window.speechSynthesis; - synth.cancel(); -} - -export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) { - if (!(text?.trim())) return; - - try { - const synth = window.speechSynthesis; - const utterThis = new SpeechSynthesisUtterance(text); - const voices = synth.getVoices(); - voiceId = voiceId || getBrowseVoiceId(); - utterThis.voice = voices.find((voice) => voiceId === voice.name) || null; - synth.speak(utterThis); - } catch (error) { - // has happened once in months of testing, not sure what was the cause - console.error('EXPERIMENTAL_speakTextStream:', error); - } -} \ No newline at end of file diff --git a/src/modules/elevenlabs/elevenlabs.client.ts b/src/modules/elevenlabs/elevenlabs.client.ts deleted file mode 100644 index 9e7e5ed09..000000000 --- a/src/modules/elevenlabs/elevenlabs.client.ts +++ /dev/null @@ -1,98 +0,0 @@ -import { getBackendCapabilities } from '~/modules/backend/store-backend-capabilities'; - -import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer'; -import { AudioPlayer } from '~/common/util/audio/AudioPlayer'; -import { CapabilitySpeechSynthesis } from '~/common/components/useCapabilities'; -import { frontendSideFetch } from '~/common/util/clientFetchers'; -import { useUIPreferencesStore } from '~/common/state/store-ui'; - -import type { SpeechInputSchema } from './elevenlabs.router'; -import { getElevenLabsData, useElevenLabsData } from './store-module-elevenlabs'; - - -export const isValidElevenLabsApiKey = (apiKey?: string) => !!apiKey && apiKey.trim()?.length >= 32; - -export const isElevenLabsEnabled = (apiKey?: string) => apiKey - ? 
isValidElevenLabsApiKey(apiKey) - : getBackendCapabilities().hasVoiceElevenLabs; - - -export function useCapability(): CapabilitySpeechSynthesis { - const [clientApiKey, voiceId] = useElevenLabsData(); - const isConfiguredServerSide = getBackendCapabilities().hasVoiceElevenLabs; - const isConfiguredClientSide = clientApiKey ? isValidElevenLabsApiKey(clientApiKey) : false; - const mayWork = isConfiguredServerSide || isConfiguredClientSide || !!voiceId; - return { mayWork, isConfiguredServerSide, isConfiguredClientSide }; -} - - -export async function speakText(text: string, voiceId?: string) { - if (!(text?.trim())) return; - - const { elevenLabsApiKey, elevenLabsVoiceId } = getElevenLabsData(); - if (!isElevenLabsEnabled(elevenLabsApiKey)) return; - - const { preferredLanguage } = useUIPreferencesStore.getState(); - const nonEnglish = !(preferredLanguage?.toLowerCase()?.startsWith('en')); - - try { - const edgeResponse = await frontendFetchAPIElevenLabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, false); - const audioBuffer = await edgeResponse.arrayBuffer(); - await AudioPlayer.playBuffer(audioBuffer); - } catch (error) { - console.error('Error playing first text:', error); - } -} - -// let liveAudioPlayer: LiveAudioPlayer | undefined = undefined; - -export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) { - if (!(text?.trim())) return; - - const { elevenLabsApiKey, elevenLabsVoiceId } = getElevenLabsData(); - if (!isElevenLabsEnabled(elevenLabsApiKey)) return; - - const { preferredLanguage } = useUIPreferencesStore.getState(); - const nonEnglish = !(preferredLanguage?.toLowerCase()?.startsWith('en')); - - try { - const edgeResponse = await frontendFetchAPIElevenLabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, true); - - // if (!liveAudioPlayer) - const liveAudioPlayer = new AudioLivePlayer(); - // fire/forget - void liveAudioPlayer.EXPERIMENTAL_playStream(edgeResponse); - - } catch (error) { - // has happened once in months of testing, not sure what was the cause - console.error('EXPERIMENTAL_speakTextStream:', error); - } -} - - -/** - * Note: we have to use this client-side API instead of TRPC because of ArrayBuffers.. 
- */
-async function frontendFetchAPIElevenLabsSpeech(text: string, elevenLabsApiKey: string, elevenLabsVoiceId: string, nonEnglish: boolean, streaming: boolean): Promise<Response> {
-  // NOTE: hardcoded 1000 as a failsafe, since the API will take very long and consume lots of credits for longer texts
-  const speechInput: SpeechInputSchema = {
-    elevenKey: elevenLabsApiKey,
-    text: text.slice(0, 1000),
-    voiceId: elevenLabsVoiceId,
-    nonEnglish,
-    ...(streaming && { streaming: true, streamOptimization: 4 }),
-  };
-
-  const response = await frontendSideFetch('/api/elevenlabs/speech', {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify(speechInput),
-  });
-
-  if (!response.ok) {
-    const errorData = await response.json();
-    throw new Error(errorData.error || errorData.message || 'Unknown error');
-  }
-
-  return response;
-}
\ No newline at end of file
diff --git a/src/modules/tts/tts.client.hooks.ts b/src/modules/tts/tts.client.hooks.ts
new file mode 100644
index 000000000..55c685729
--- /dev/null
+++ b/src/modules/tts/tts.client.hooks.ts
@@ -0,0 +1,11 @@
+import { useTTSEngine } from './useTTSStore';
+import { findTTSVendor } from './vendors/vendors.registry';
+
+export function useTTSCapability() {
+  const [TTSEngine] = useTTSEngine(); // subscribe, so callers re-render when the engine changes
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  return vendor.getCapabilityInfo();
+}
diff --git a/src/modules/tts/tts.client.ts b/src/modules/tts/tts.client.ts
new file mode 100644
index 000000000..c96a09eca
--- /dev/null
+++ b/src/modules/tts/tts.client.ts
@@ -0,0 +1,41 @@
+import { getTTSEngine } from './useTTSStore';
+import { findTTSVendor } from './vendors/vendors.registry';
+
+export async function speakText(text: string, voiceId?: string) {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  return vendor.speakText(text, voiceId);
+}
+
+export async function EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  return vendor.EXPERIMENTAL_speakTextStream(text, voiceId);
+}
+
+export function cancel() {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  if (!vendor.cancel) {
+    return;
+  }
+  return vendor.cancel();
+}
+
+export function getName() {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor) {
+    throw new Error(`No TTS Engine found for ${TTSEngine}`);
+  }
+  return vendor.name;
+}
\ No newline at end of file
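For orientation, call sites now go through this `tts.client.ts` facade instead of importing a vendor module directly; the facade re-resolves the vendor from the persisted engine key on every call. A minimal usage sketch (the two handler functions are hypothetical; `speakText` and `cancel` are the exports above):

```ts
import { cancel, speakText } from '~/modules/tts/tts.client';

// Hypothetical handler: speak a message with the engine currently selected in Settings.
async function onSpeakClicked(messageText: string) {
  try {
    // voiceId is omitted, so the active vendor falls back to its own stored voice
    await speakText(messageText);
  } catch (err) {
    // speakText throws when no vendor matches the persisted engine key
    console.warn('TTS unavailable:', err);
  }
}

// Hypothetical cleanup: stop any in-flight utterance (a no-op for vendors without cancel()).
function onCallEnded() {
  cancel();
}
```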
diff --git a/src/modules/tts/tts.setting.tsx b/src/modules/tts/tts.setting.tsx
new file mode 100644
index 000000000..3bcb26d14
--- /dev/null
+++ b/src/modules/tts/tts.setting.tsx
@@ -0,0 +1,11 @@
+import { getTTSEngine } from './useTTSStore';
+import { findTTSVendor } from './vendors/vendors.registry';
+
+export function TTSSetting() {
+  const TTSEngine = getTTSEngine();
+  const vendor = findTTSVendor(TTSEngine);
+  if (!vendor || !vendor.TTSSettingsComponent) {
+    return <></>;
+  }
+  return <vendor.TTSSettingsComponent />;
+}
diff --git a/src/modules/tts/useTTSStore.ts b/src/modules/tts/useTTSStore.ts
new file mode 100644
index 000000000..6100a65c0
--- /dev/null
+++ b/src/modules/tts/useTTSStore.ts
@@ -0,0 +1,34 @@
+import { create } from 'zustand';
+import { persist } from 'zustand/middleware';
+import { useShallow } from 'zustand/react/shallow';
+
+export type TTSEngineKey = 'elevenlabs' | 'webspeech';
+
+export const TTSEngineList: { key: TTSEngineKey; label: string }[] = [
+  {
+    key: 'elevenlabs',
+    label: 'ElevenLabs',
+  },
+  {
+    key: 'webspeech',
+    label: 'Web Speech API',
+  },
+];
+
+interface TTSStore {
+  TTSEngine: TTSEngineKey;
+  setTTSEngine: (TTSEngine: TTSEngineKey) => void;
+}
+
+const useTTSStore = create<TTSStore>()(
+  persist(
+    (_set, _get) => ({
+      TTSEngine: TTSEngineList[0].key,
+      setTTSEngine: (TTSEngine: TTSEngineKey) => _set({ TTSEngine }),
+    }),
+    { name: 'tts' },
+  ),
+);
+
+export const useTTSEngine = (): [TTSEngineKey, (TTSEngine: TTSEngineKey) => void] => useTTSStore(useShallow((state) => [state.TTSEngine, state.setTTSEngine]));
+export const getTTSEngine = () => useTTSStore.getState().TTSEngine;
diff --git a/src/modules/tts/vendors/ISpeechSynthesis.ts b/src/modules/tts/vendors/ISpeechSynthesis.ts
new file mode 100644
index 000000000..8a40fca41
--- /dev/null
+++ b/src/modules/tts/vendors/ISpeechSynthesis.ts
@@ -0,0 +1,30 @@
+import type React from 'react';
+
+import type { SvgIconProps } from '@mui/joy';
+import { TTSEngineKey } from '../useTTSStore';
+
+export interface ISpeechSynthesis {
+  readonly id: TTSEngineKey;
+  readonly name: string;
+  readonly location: 'local' | 'cloud';
+
+  // components
+  // readonly Icon: React.FunctionComponent<SvgIconProps>;
+  readonly TTSSettingsComponent?: React.ComponentType;
+
+  /// abstraction interface ///
+
+  hasVoices?(): boolean;
+  getCapabilityInfo(): CapabilitySpeechSynthesis;
+  speakText(text: string, voiceId?: string): Promise<void>;
+  EXPERIMENTAL_speakTextStream(text: string, voiceId?: string): Promise<void>;
+  cancel?(): Promise<void>;
+  stop?(): Promise<void>;
+  resume?(): Promise<void>;
+}
+
+export interface CapabilitySpeechSynthesis {
+  mayWork: boolean;
+  isConfiguredServerSide: boolean;
+  isConfiguredClientSide: boolean;
+}
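Since `useTTSEngine` and `TTSEngineList` are the store's public surface, a settings control can bind directly to them. A minimal sketch of such a picker, in the spirit of the `VoiceSettings` change above (component structure and labels are assumptions):

```tsx
import * as React from 'react';
import { Option, Select } from '@mui/joy';

import { TTSEngineKey, TTSEngineList, useTTSEngine } from '~/modules/tts/useTTSStore';

// Hypothetical picker: writes straight to the persisted 'tts' store slice.
export function TTSEnginePicker() {
  const [TTSEngine, setTTSEngine] = useTTSEngine();
  return (
    <Select
      value={TTSEngine}
      onChange={(_event, newValue: TTSEngineKey | null) => newValue && setTTSEngine(newValue)}
    >
      {TTSEngineList.map((engine) => (
        <Option key={engine.key} value={engine.key}>{engine.label}</Option>
      ))}
    </Select>
  );
}
```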
diff --git a/src/modules/elevenlabs/ElevenlabsSettings.tsx b/src/modules/tts/vendors/elevenlabs/ElevenlabsSettings.tsx
similarity index 87%
rename from src/modules/elevenlabs/ElevenlabsSettings.tsx
rename to src/modules/tts/vendors/elevenlabs/ElevenlabsSettings.tsx
index 97ebc64b3..5a93fc12f 100644
--- a/src/modules/elevenlabs/ElevenlabsSettings.tsx
+++ b/src/modules/tts/vendors/elevenlabs/ElevenlabsSettings.tsx
@@ -5,9 +5,8 @@ import { FormControl } from '@mui/joy';
 import { AlreadySet } from '~/common/components/AlreadySet';
 import { FormInputKey } from '~/common/components/forms/FormInputKey';
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
-import { useVoiceCapability } from '~/common/components/useCapabilities';
 
-import { isElevenLabsEnabled } from './elevenlabs.client';
+import { elevenlabs, isElevenLabsEnabled } from './elevenlabs.vendor';
 import { useElevenLabsVoiceDropdown } from './useElevenLabsVoiceDropdown';
 import { useElevenLabsApiKey } from './store-module-elevenlabs';
 
@@ -16,7 +15,7 @@ export function ElevenlabsSettings() {
 
   // external state
   const [apiKey, setApiKey] = useElevenLabsApiKey();
-  const { isConfiguredServerSide } = useVoiceCapability();
+  const { isConfiguredServerSide } = elevenlabs.getCapabilityInfo();
 
   const { voicesDropdown } = useElevenLabsVoiceDropdown(true);
 
diff --git a/src/modules/elevenlabs/elevenlabs.router.ts b/src/modules/tts/vendors/elevenlabs/elevenlabs.router.ts
similarity index 100%
rename from src/modules/elevenlabs/elevenlabs.router.ts
rename to src/modules/tts/vendors/elevenlabs/elevenlabs.router.ts
diff --git a/src/modules/elevenlabs/elevenlabs.server.ts b/src/modules/tts/vendors/elevenlabs/elevenlabs.server.ts
similarity index 100%
rename from src/modules/elevenlabs/elevenlabs.server.ts
rename to src/modules/tts/vendors/elevenlabs/elevenlabs.server.ts
diff --git a/src/modules/tts/vendors/elevenlabs/elevenlabs.vendor.ts b/src/modules/tts/vendors/elevenlabs/elevenlabs.vendor.ts
new file mode 100644
index 000000000..46b1958e3
--- /dev/null
+++ b/src/modules/tts/vendors/elevenlabs/elevenlabs.vendor.ts
@@ -0,0 +1,107 @@
+import { getBackendCapabilities } from '~/modules/backend/store-backend-capabilities';
+
+import { AudioLivePlayer } from '~/common/util/audio/AudioLivePlayer';
+import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
+import { frontendSideFetch } from '~/common/util/clientFetchers';
+import { useUIPreferencesStore } from '~/common/state/store-ui';
+
+import type { SpeechInputSchema } from './elevenlabs.router';
+import { getElevenLabsData, useElevenLabsData } from './store-module-elevenlabs';
+import { ElevenlabsSettings } from './ElevenlabsSettings';
+import { CapabilitySpeechSynthesis, ISpeechSynthesis } from '../ISpeechSynthesis';
+
+const isValidElevenLabsApiKey = (apiKey?: string) => !!apiKey && apiKey.trim()?.length >= 32;
+
+export const isElevenLabsEnabled = (apiKey?: string) => (apiKey ? isValidElevenLabsApiKey(apiKey) : getBackendCapabilities().hasVoiceElevenLabs);
+
+/**
+ * Note: we have to use this client-side API instead of TRPC because of ArrayBuffers..
+ */
+async function frontendFetchAPIElevenLabsSpeech(
+  text: string,
+  elevenLabsApiKey: string,
+  elevenLabsVoiceId: string,
+  nonEnglish: boolean,
+  streaming: boolean,
+): Promise<Response> {
+  // NOTE: hardcoded 1000 as a failsafe, since the API will take very long and consume lots of credits for longer texts
+  const speechInput: SpeechInputSchema = {
+    elevenKey: elevenLabsApiKey,
+    text: text.slice(0, 1000),
+    voiceId: elevenLabsVoiceId,
+    nonEnglish,
+    ...(streaming && { streaming: true, streamOptimization: 4 }),
+  };
+
+  const response = await frontendSideFetch('/api/elevenlabs/speech', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(speechInput),
+  });
+
+  if (!response.ok) {
+    const errorData = await response.json();
+    throw new Error(errorData.error || errorData.message || 'Unknown error');
+  }
+
+  return response;
+}
+
+export const elevenlabs: ISpeechSynthesis = {
+  id: 'elevenlabs',
+  name: 'ElevenLabs',
+  location: 'cloud',
+
+  // components
+  TTSSettingsComponent: ElevenlabsSettings,
+
+  // functions
+  getCapabilityInfo(): CapabilitySpeechSynthesis {
+    const { elevenLabsApiKey: clientApiKey, elevenLabsVoiceId: voiceId } = getElevenLabsData();
+    const isConfiguredServerSide = getBackendCapabilities().hasVoiceElevenLabs;
+    const isConfiguredClientSide = clientApiKey ? isValidElevenLabsApiKey(clientApiKey) : false;
+    const mayWork = isConfiguredServerSide || isConfiguredClientSide || !!voiceId;
+    return { mayWork, isConfiguredServerSide, isConfiguredClientSide };
+  },
+
+  async speakText(text: string, voiceId?: string) {
+    if (!text?.trim()) return;
+
+    const { elevenLabsApiKey, elevenLabsVoiceId } = getElevenLabsData();
+    if (!isElevenLabsEnabled(elevenLabsApiKey)) return;
+
+    const { preferredLanguage } = useUIPreferencesStore.getState();
+    const nonEnglish = !preferredLanguage?.toLowerCase()?.startsWith('en');
+
+    try {
+      const edgeResponse = await frontendFetchAPIElevenLabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, false);
+      const audioBuffer = await edgeResponse.arrayBuffer();
+      await AudioPlayer.playBuffer(audioBuffer);
+    } catch (error) {
+      console.error('Error playing first text:', error);
+    }
+  },

+  // let liveAudioPlayer: LiveAudioPlayer | undefined = undefined;
+  async EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+    if (!text?.trim()) return;
+
+    const { elevenLabsApiKey, elevenLabsVoiceId } = getElevenLabsData();
+    if (!isElevenLabsEnabled(elevenLabsApiKey)) return;
+
+    const { preferredLanguage } = useUIPreferencesStore.getState();
+    const nonEnglish = !preferredLanguage?.toLowerCase()?.startsWith('en');
+
+    try {
+      const edgeResponse = await frontendFetchAPIElevenLabsSpeech(text, elevenLabsApiKey, voiceId || elevenLabsVoiceId, nonEnglish, true);
+
+      // if (!liveAudioPlayer)
+      const liveAudioPlayer = new AudioLivePlayer();
+      // fire/forget
+      void liveAudioPlayer.EXPERIMENTAL_playStream(edgeResponse);
+    } catch (error) {
+      // has happened once in months of testing, not sure what was the cause
+      console.error('EXPERIMENTAL_speakTextStream:', error);
+    }
+  },
+};
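The vendor's `getCapabilityInfo()` is what `useTTSCapability()` surfaces, so UI code can gate speech features on `mayWork` without knowing which engine is active. A small sketch (the `SpeakButton` component is hypothetical):

```tsx
import * as React from 'react';
import { Button } from '@mui/joy';

import { speakText } from '~/modules/tts/tts.client';
import { useTTSCapability } from '~/modules/tts/tts.client.hooks';

// Hypothetical gate: for ElevenLabs, mayWork means a server- or client-side key
// (or a voice id) is configured; for Web Speech it means the browser lists voices.
export function SpeakButton(props: { text: string }) {
  const { mayWork } = useTTSCapability();
  if (!mayWork) return null;
  return <Button onClick={() => void speakText(props.text)}>Speak</Button>;
}
```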
diff --git a/src/modules/elevenlabs/store-module-elevenlabs.ts b/src/modules/tts/vendors/elevenlabs/store-module-elevenlabs.ts
similarity index 100%
rename from src/modules/elevenlabs/store-module-elevenlabs.ts
rename to src/modules/tts/vendors/elevenlabs/store-module-elevenlabs.ts
diff --git a/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx b/src/modules/tts/vendors/elevenlabs/useElevenLabsVoiceDropdown.tsx
similarity index 98%
rename from src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
rename to src/modules/tts/vendors/elevenlabs/useElevenLabsVoiceDropdown.tsx
index 24de0b003..9b2bb0fa4 100644
--- a/src/modules/elevenlabs/useElevenLabsVoiceDropdown.tsx
+++ b/src/modules/tts/vendors/elevenlabs/useElevenLabsVoiceDropdown.tsx
@@ -8,7 +8,7 @@ import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
 import { apiQuery } from '~/common/util/trpc.client';
 
 import { VoiceSchema } from './elevenlabs.router';
-import { isElevenLabsEnabled } from './elevenlabs.client';
+import { isElevenLabsEnabled } from './elevenlabs.vendor';
 import { useElevenLabsApiKey, useElevenLabsVoiceId } from './store-module-elevenlabs';
 
diff --git a/src/modules/tts/vendors/vendors.registry.ts b/src/modules/tts/vendors/vendors.registry.ts
new file mode 100644
index 000000000..75319650f
--- /dev/null
+++ b/src/modules/tts/vendors/vendors.registry.ts
@@ -0,0 +1,19 @@
+import { TTSEngineKey } from '../useTTSStore';
+import { elevenlabs } from './elevenlabs/elevenlabs.vendor';
+import { ISpeechSynthesis } from './ISpeechSynthesis';
+import { webspeech } from './webspeech/webspeech.vendor';
+
+/** Global: Vendor Instances Registry **/
+const MODEL_VENDOR_REGISTRY: Record<TTSEngineKey, ISpeechSynthesis> = {
+  elevenlabs: elevenlabs,
+  webspeech: webspeech,
+};
+
+export function findAllTTSVendors(): ISpeechSynthesis[] {
+  const modelVendors = Object.values(MODEL_VENDOR_REGISTRY);
+  return modelVendors;
+}
+
+export function findTTSVendor(TTSEngineKey?: TTSEngineKey): ISpeechSynthesis | null {
+  return TTSEngineKey ? (MODEL_VENDOR_REGISTRY[TTSEngineKey] ?? null) : null;
+}
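With the registry keyed by `TTSEngineKey`, wiring up a third engine is a three-file change. A hedged sketch, using an invented `'mytts'` key (none of these names exist in the patch):

```ts
// 1) useTTSStore.ts — widen the union and the user-facing list:
export type TTSEngineKey = 'elevenlabs' | 'webspeech' | 'mytts';
// TTSEngineList gains: { key: 'mytts', label: 'My TTS' }

// 2) vendors/mytts/mytts.vendor.ts — implement ISpeechSynthesis:
import { CapabilitySpeechSynthesis, ISpeechSynthesis } from '../ISpeechSynthesis';

export const mytts: ISpeechSynthesis = {
  id: 'mytts',
  name: 'My TTS',
  location: 'cloud',
  getCapabilityInfo: (): CapabilitySpeechSynthesis =>
    ({ mayWork: true, isConfiguredServerSide: true, isConfiguredClientSide: false }),
  speakText: async (text: string, _voiceId?: string) => {
    // fetch audio from a hypothetical endpoint and play it
  },
  EXPERIMENTAL_speakTextStream: async (text: string, voiceId?: string) => {
    // simplest fallback until true streaming exists: delegate to speakText
    return mytts.speakText(text, voiceId);
  },
};

// 3) vendors.registry.ts — add the instance to the map:
// const MODEL_VENDOR_REGISTRY: Record<TTSEngineKey, ISpeechSynthesis> =
//   { elevenlabs, webspeech, mytts };
```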
diff --git a/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx b/src/modules/tts/vendors/webspeech/WebspeechSettings.tsx
similarity index 95%
rename from src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
rename to src/modules/tts/vendors/webspeech/WebspeechSettings.tsx
index 4a6fc6441..d4c4cf801 100644
--- a/src/modules/browser/speech-synthesis/BrowserSpeechSettings.tsx
+++ b/src/modules/tts/vendors/webspeech/WebspeechSettings.tsx
@@ -6,13 +6,13 @@ import CloseRounded from '@mui/icons-material/CloseRounded';
 import { addSnackbar } from '~/common/components/snackbar/useSnackbarsStore';
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 
-import { useBrowserSpeechVoiceDropdown } from './useBrowserSpeechVoiceDropdown';
-import { useLanguageCodeForFilter } from './store-module-browser';
+import { useBrowserSpeechVoiceDropdown } from './useWebspeechVoiceDropdown';
+import { useLanguageCodeForFilter } from './store-module-webspeech';
 
 // languages are defined as a JSON file
 import languages from './preSelect/Languages.json';
 
-export function BrowserSpeechSettings() {
+export function WebspeechSettings() {
   // state
   const [testUtterance, setTestUtterance] = React.useState(null);
   const [voiceNameFilters, setVoiceNameFilters] = React.useState(null);
diff --git a/src/modules/browser/speech-synthesis/preSelect/Languages.json b/src/modules/tts/vendors/webspeech/preSelect/Languages.json
similarity index 100%
rename from src/modules/browser/speech-synthesis/preSelect/Languages.json
rename to src/modules/tts/vendors/webspeech/preSelect/Languages.json
diff --git a/src/modules/browser/speech-synthesis/store-module-browser.tsx b/src/modules/tts/vendors/webspeech/store-module-webspeech.ts
similarity index 100%
rename from src/modules/browser/speech-synthesis/store-module-browser.tsx
rename to src/modules/tts/vendors/webspeech/store-module-webspeech.ts
diff --git a/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx b/src/modules/tts/vendors/webspeech/useWebspeechVoiceDropdown.tsx
similarity index 96%
rename from src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx
rename to src/modules/tts/vendors/webspeech/useWebspeechVoiceDropdown.tsx
index 9db7d462e..c0ff014ea 100644
--- a/src/modules/browser/speech-synthesis/useBrowserSpeechVoiceDropdown.tsx
+++ b/src/modules/tts/vendors/webspeech/useWebspeechVoiceDropdown.tsx
@@ -4,8 +4,8 @@ import { CircularProgress, Option, Select } from '@mui/joy';
 import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
 import RecordVoiceOverTwoToneIcon from '@mui/icons-material/RecordVoiceOverTwoTone';
 
-import { useBrowseVoiceId } from './store-module-browser';
-import { speakText, cancel } from './browser.speechSynthesis.client';
+import { useBrowseVoiceId } from './store-module-webspeech';
+import { speakText, cancel } from '../../tts.client';
 
 function VoicesDropdown(props: {
   isValidKey: boolean;
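Note that the voice dropdown now previews voices through the engine-agnostic facade rather than the old `browser.speechSynthesis.client`. A sketch of the preview pattern this enables (the helper and sample sentence are hypothetical):

```ts
import { cancel, speakText } from '~/modules/tts/tts.client';

// Hypothetical preview: cut off any utterance in flight, then speak a short
// sample with the candidate voice (for Web Speech, voiceId is the voice name).
async function previewVoice(voiceId: string) {
  cancel();
  await speakText('Hello! This is how I sound.', voiceId);
}
```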
diff --git a/src/modules/tts/vendors/webspeech/webspeech.vendor.ts b/src/modules/tts/vendors/webspeech/webspeech.vendor.ts
new file mode 100644
index 000000000..9242ab50c
--- /dev/null
+++ b/src/modules/tts/vendors/webspeech/webspeech.vendor.ts
@@ -0,0 +1,65 @@
+import { getBrowseVoiceId } from './store-module-webspeech';
+import { CapabilitySpeechSynthesis, ISpeechSynthesis } from '../ISpeechSynthesis';
+import { WebspeechSettings } from './WebspeechSettings';
+
+export const webspeech: ISpeechSynthesis = {
+  id: 'webspeech',
+  name: 'Web Speech API',
+  location: 'local',
+
+  // components
+  TTSSettingsComponent: WebspeechSettings,
+
+  // functions
+
+  getCapabilityInfo(): CapabilitySpeechSynthesis {
+    const synth = window.speechSynthesis;
+    const voices = synth.getVoices();
+    const isConfiguredServerSide = false;
+    const isConfiguredClientSide = true;
+    const mayWork = voices.length > 0;
+    return { mayWork, isConfiguredServerSide, isConfiguredClientSide };
+  },
+
+  hasVoices() {
+    const synth = window.speechSynthesis;
+    const voices = synth.getVoices();
+    return voices.length > 0;
+  },
+
+  async speakText(text: string, voiceId?: string) {
+    if (!text?.trim()) return;
+
+    try {
+      const synth = window.speechSynthesis;
+      const utterThis = new SpeechSynthesisUtterance(text);
+      const voices = synth.getVoices();
+      voiceId = voiceId || getBrowseVoiceId();
+      utterThis.voice = voices.find((voice) => voiceId === voice.name) || null;
+      synth.speak(utterThis);
+    } catch (error) {
+      console.error('Error playing first text:', error);
+    }
+  },
+
+  async cancel() {
+    const synth = window.speechSynthesis;
+    synth.cancel();
+  },
+
+  async EXPERIMENTAL_speakTextStream(text: string, voiceId?: string) {
+    if (!text?.trim()) return;
+
+    try {
+      const synth = window.speechSynthesis;
+      const utterThis = new SpeechSynthesisUtterance(text);
+      const voices = synth.getVoices();
+      voiceId = voiceId || getBrowseVoiceId();
+      utterThis.voice = voices.find((voice) => voiceId === voice.name) || null;
+      synth.speak(utterThis);
+    } catch (error) {
+      // has happened once in months of testing, not sure what was the cause
+      console.error('EXPERIMENTAL_speakTextStream:', error);
+    }
+  },
+};
diff --git a/src/server/api/trpc.router-edge.ts b/src/server/api/trpc.router-edge.ts
index 54fce437b..e2d3fac67 100644
--- a/src/server/api/trpc.router-edge.ts
+++ b/src/server/api/trpc.router-edge.ts
@@ -2,7 +2,7 @@ import { createTRPCRouter } from './trpc.server';
 
 import { aixRouter } from '~/modules/aix/server/api/aix.router';
 import { backendRouter } from '~/modules/backend/backend.router';
-import { elevenlabsRouter } from '~/modules/elevenlabs/elevenlabs.router';
+import { elevenlabsRouter } from '~/modules/tts/vendors/elevenlabs/elevenlabs.router';
 import { googleSearchRouter } from '~/modules/google/search.router';
 import { llmAnthropicRouter } from '~/modules/llms/server/anthropic/anthropic.router';
 import { llmGeminiRouter } from '~/modules/llms/server/gemini/gemini.router';
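One caveat for the Web Speech vendor above: in Chrome and some other browsers, `speechSynthesis.getVoices()` returns an empty array until the asynchronous `voiceschanged` event fires, so `getCapabilityInfo().mayWork` and `hasVoices()` can read false on first render. A defensive helper along these lines could smooth that over (hypothetical, not part of this patch):

```ts
// Resolve the voice list, waiting once for `voiceschanged` if it is still empty.
function getVoicesAsync(timeoutMs = 1500): Promise<SpeechSynthesisVoice[]> {
  return new Promise((resolve) => {
    const synth = window.speechSynthesis;
    const voices = synth.getVoices();
    if (voices.length) return resolve(voices);
    const done = () => resolve(synth.getVoices());
    synth.addEventListener('voiceschanged', done, { once: true });
    setTimeout(done, timeoutMs); // some engines never fire the event
  });
}
```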